#library reticulate to be able to add chunks in another language
#library(reticulate)
#To run this .rmd file in a terminal being in the project folder:
#export PATH=${PATH}:/cm/shared/apps/R/deps/rstudio/bin/pandoc
#file="./report/Variant_to_Gene_Report.Rmd"
#Rscript -e 'rmarkdown::render("'$file'")'
Identify and prioritise genes using variant-to-gene mapping tools
with credible set variants from the fine-mapping results of a GWAS of
severe asthma in UK Biobank European individuals.
Master Code (overall
pipeline): src/Var_to_Gene_pipeline.sh
#!/usr/bin/env bash
#Rationale: pipeline for Variant to Gene mapping analysis. Output from each analysis: list of genes
#NOTE: this file is a master record of commands run (largely interactively) on the HPC cluster;
#later steps assume earlier sbatch jobs have completed - it is not a fully automated script.
module load R
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
################
#1.VARIANT ANNOTATION
################
#From FAVOR website: Two Info:
#Nearest gene for sentinel with highest PIP in each locus
#Functional annotated credset variants according to FANTOM5, ClinVar and Integrative Functional Score criteria
Rscript src/Variant_annotation_FAVOR.R input/FAVOR_credset_chrpos38_2023_08_08.csv
#Add nearest genes to the var2gene_raw.xlsx file.
################
#2.1 COLOCALISATION - eQTL
#GTExV8, eQTLGen, UBCLung
################
##Exclude chromosome 6 - no eQTL colocalisation for this locus.
##Genomic boundaries: PIP-max causal variant +/- 1Mb
#Credible-set identifiers, one per fine-mapped locus, formatted SA_<chr>_<pos>_<ref>_<alt>
#(the chr6 locus is excluded from colocalisation as noted above):
cs=('SA_8_81292599_C_A' 'SA_6_90963614_AT_A' 'SA_5_110401872_C_T' 'SA_2_242692858_T_C' 'SA_15_67442596_T_C' 'SA_12_56435504_C_G' 'SA_11_76296671_G_A' 'SA_9_6209697_G_A' 'SA_5_131885240_C_G' 'SA_3_33042712_C_T' 'SA_2_102913642_AAAAC_A' 'SA_17_38168828_G_A' 'SA_16_27359021_C_A' 'SA_15_61068954_C_T' 'SA_12_48202941_T_C' 'SA_10_9064716_C_T')
#Full credible-set variant file covering all loci (input to 000_preprocess_cs.R below):
cs_all="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt"
###GTExv8 eQTL###
##Create eQTL files in hg19 for Colon Transverse, Colon Sigmoid, Skin_Not_Sun_Exposed_Suprapubic, Skin_Sun_Exposed_Lower_leg
#From GTExV8 .parquet files and in hg38.
#Workflow: 000A extracts per-chromosome eQTL data from the .parquet files (array job over the
#22 autosomes), 000B lifts coordinates hg38 -> hg19, 000C converts the lifted files into the
#format expected by the colocalisation scripts.
mkdir ${tmp_path}/liftover_gtexv8
mkdir ${tmp_path}/liftover_gtexv8/bed
dos2unix src/coloc/000A_eqtl_gtex_extraction.R src/coloc/000B_eqtl_gtex_liftover.sh \
src/coloc/000C_eqtl_gtex_conversion.R src/coloc/000A_submit_eqtl_gtex_extraction.sh src/coloc/000C_submit_eqtl_gtex_conversion.sh
chmod o+x src/coloc/000A_eqtl_gtex_extraction.R src/coloc/000B_eqtl_gtex_liftover.sh \
src/coloc/000C_eqtl_gtex_conversion.R src/coloc/000A_submit_eqtl_gtex_extraction.sh src/coloc/000C_submit_eqtl_gtex_conversion.sh
sbatch --array=1-22 src/coloc/000A_submit_eqtl_gtex_extraction.sh
sbatch src/coloc/000B_eqtl_gtex_liftover.sh
#FIX: the third step must submit the *conversion* script. The original line re-submitted
#"000C_submit_eqtl_gtex_extraction.sh", which does not exist (only the 000A extraction and the
#000C conversion submit scripts are prepared by the dos2unix/chmod lines above), and the
#extraction array was already submitted two lines up.
sbatch --array=1-22 src/coloc/000C_submit_eqtl_gtex_conversion.sh
##variables needed:
#GTExV8 tissues tested for colocalisation (NB: this reuses the name "tissue" as an array here
#and as a scalar further below):
tissue=('Stomach' 'Small_Intestine_Terminal_Ileum' 'Lung' 'Esophagus_Muscularis' 'Esophagus_Gastroesophageal_Junction' 'Artery_Tibial' 'Artery_Coronary' 'Artery_Aorta' 'Colon_Transverse' 'Colon_Sigmoid' 'Skin_Sun_Exposed_Lower_leg' 'Skin_Not_Sun_Exposed_Suprapubic')
##Divide the credible sets into separate files:
Rscript ./src/coloc/000_preprocess_cs.R $cs_all $tmp_path
#Obtain GWASpairs from credible set regions:
#.sh will run .R script:
for c in ${!cs[*]}; do
sbatch --export=CREDSET="${cs[c]}" ./src/coloc/001_submit_GWASpairs.sh
done
#Create files for GTExV8:
##'Colon Transverse' and 'Colon Sigmoid' miss from /data/gen1/ACEI/colocalisation_datasets/eQTL/GTeX/
##Need to create these
#.sh will run .R script:
#One lookup job per tissue x credible-set pair:
for t in ${!tissue[*]}; do
for c in ${!cs[@]}; do
sbatch --export=TISSUE="${tissue[t]}",CREDSET="${cs[c]}" ./src/coloc/001_submit_eqtl_lookup_GTEx.sh
done
done
##Get the LD matrix:
##Create the file with gtex-locus pairs:
for t in ${!tissue[*]}; do Rscript ./src/coloc/002_prepare_LDinput.R "${tissue[t]}"; done
##Get LD:
#with parameters for GTExV8
sbatch ./src/coloc/002_get_LD.sh
##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-81713.out
##after adding Colon and Skin tissues, to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-153659.out
##Run the colocalisation for GTExV8:
#.sh will run .R script:
mkdir ${tmp_path}/results
mkdir ${tmp_path}/results/gtex
#run for each Tissue:
#NOTE: these scalar assignments were used one at a time - the loop below was re-run manually
#once per tissue. If this file is executed top-to-bottom only the last assignment
#('Skin_Not_Sun_Exposed_Suprapubic') takes effect, and the scalar shadows the tissue array above.
tissue='Stomach'
tissue='Small_Intestine_Terminal_Ileum'
tissue='Lung'
tissue='Esophagus_Muscularis'
tissue='Esophagus_Gastroesophageal_Junction'
tissue='Artery_Tibial'
tissue='Artery_Coronary'
tissue='Artery_Aorta'
tissue='Colon_Transverse'
tissue='Colon_Sigmoid'
tissue='Skin_Sun_Exposed_Lower_leg'
tissue='Skin_Not_Sun_Exposed_Suprapubic'
for c in ${!cs[*]}; do
#N = number of candidate genes for this credset/tissue pair; one coloc job per gene,
#throttled to 20 concurrent array tasks:
N=`cat ${tmp_path}${cs[c]}_${tissue}_genes.txt | wc -l`
sbatch --array=1-${N}%20 --export=TISSUE="${tissue}",CREDSET="${cs[c]}" ./src/coloc/003_submit_coloc_susie_GTEx.sh
sleep 5
done
######QUALITY CHECKS:
#to find number of genes:
#wc -l SA_*_${tissue}_genes.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
#Gene counts per tissue recorded from the command above:
#'Stomach' 575
#'Small_Intestine_Terminal_Ileum' 635
#'Lung' 633
#'Esophagus_Muscolaris' 564
#'Esophagus_Gastroesophageal_Junction' 576
#tissue='Artery_Tibial' 573
#tissue='Artery_Coronary' 603
#tissue='Artery_Aorta' 586
#tissue='Colon_Transverse' 610
#tissue='Colon_Sigmoid' 585
#tissue='Skin_Sun_Exposed_Lower_leg' 673
#tissue='Skin_Not_Sun_Exposed_Suprapubic' 670
##Check that all genes for each tissue have been analysed:
#(counts distinct log files mentioning the tissue; compare with the counts above)
grep ${tissue} ${tmp_path}/logerror/coloc_susie_gtex*.out | awk -F ":" '{print $1}' | sort -u | wc -l
##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr ${tmp_path}/results/gtex/*all_coloc.rds | grep ${tissue} | wc -l
ls -lthr ${tmp_path}/results/gtex/*all_susie*.rds | grep ${tissue} | wc -l
#gtex converted in gene symbol:
#https://www.biotools.fr/human/ensembl_symbol_converter
#added in GTExV8_eQTL_genes_symbol table in the input/var2gene.xlsx file.
#Keep the header (NR==1) plus rows where column 11 (coloc) / column 16 (coloc-susie) is "TRUE"
#(presumably the colocalisation-significance flag - confirm against the .tsv header):
awk 'NR ==1; $11 == "TRUE" {print $0}' ${tmp_path}/results/coloc_asthma_GTEx.tsv \
> output/coloc_asthma_GTEx.tsv
awk 'NR ==1; $16 == "TRUE" {print $0}' ${tmp_path}/results/colocsusie_asthma_GTEx.tsv \
> output/colocsusie_asthma_GTEx.tsv
###eqtlGen eQTL###
mkdir ${tmp_path}/results/eqtlgen
mkdir ${tmp_path}/eqtlgen
dos2unix src/coloc/000_submit_edit_eQTLGen.sh src/coloc/000_run_edit_eQTLGen.R
chmod +x src/coloc/000_submit_edit_eQTLGen.sh src/coloc/000_run_edit_eQTLGen.R
sbatch src/coloc/000_submit_edit_eQTLGen.sh
#Create files for eQTLGen colocalisation:
dos2unix src/coloc/001_submit_eqtl_lookup_eQTLGen.sh src/coloc/001_run_eqtl_lookup_eQTLGen.R
chmod +x src/coloc/001_submit_eqtl_lookup_eQTLGen.sh src/coloc/001_run_eqtl_lookup_eQTLGen.R
for c in ${!cs[@]}; do
sbatch --export=CREDSET="${cs[c]}" ./src/coloc/001_submit_eqtl_lookup_eQTLGen.sh
done
##Get the LD matrix:
##Create the file with eqtlgen-locus pairs:
dos2unix src/coloc/002_prepare_LDinput_eqtlgen.R
chmod +x src/coloc/002_prepare_LDinput_eqtlgen.R
Rscript src/coloc/002_prepare_LDinput_eqtlgen.R "eqtlGenWB"
##Get LD:
#with parameters for eqtlGen
sbatch src/coloc/002_get_LD.sh
##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-170767.out
##Run the colocalisation for eQTLGen:
#.sh will run .R script:
#NOTE(review): directory already created at the top of this section - this mkdir just
#prints a harmless "File exists" error.
mkdir ${tmp_path}/results/eqtlgen
dos2unix src/coloc/003_submit_coloc_susie_eQTLGen.sh
chmod +x src/coloc/003_run_coloc_susie_eQTLGen.R
for c in ${!cs[*]}; do
#N = number of candidate genes in whole blood for this credset; one coloc job per gene:
N=`cat ${tmp_path}eqtlgen/${cs[c]}_eqtlGenWB_genes.txt | wc -l`
sbatch --array=1-${N}%20 --export=CREDSET="${cs[c]}" ./src/coloc/003_submit_coloc_susie_eQTLGen.sh
sleep 5
done
######QUALITY CHECKS:
#to find number of genes: 468
wc -l ${tmp_path}/eqtlgen/SA_*_eqtlGenWB_genes.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
##Check that all genes for each tissue have been analysed:
grep "eqtlGenWB" ${tmp_path}/logerror/coloc_susie_eqtlgen*.out | awk -F ":" '{print $1}' | sort -u | wc -l
##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr ${tmp_path}/results/eqtlgen/*all_coloc.rds | grep "eqtlGenWB" | wc -l
ls -lthr ${tmp_path}/results/eqtlgen/*all_susie*.rds | grep "eqtlGenWB" | wc -l
#Keep the header plus rows where column 11 is "TRUE" (significant coloc results):
awk 'NR ==1; $11 == "TRUE" {print $0}' /scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_eqtlgen.tsv \
> output/coloc_asthma_eqtlgen.tsv
#no TRUE results for coloc-susie in eQTLGen
#Find statistically significant colocalisation results for GTExV8 and eqtlGen eQTL, and add results into var2gene_raw.xlsx:
Rscript ./src/coloc/004_concat_coloc_results.R
###UBCLung eQTL###
mkdir ${tmp_path}/results/ubclung
mkdir ${tmp_path}/ubclung
dos2unix src/coloc_UBClung/*
chmod +x src/coloc_UBClung/*
for c in ${!cs[*]}; do
sbatch --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh
done
##Get the LD matrix:
##Create the file with probeset-locus pairs:
dos2unix src/coloc_UBClung/002_prepare_LDinput_ubclung.R
chmod +x src/coloc_UBClung/002_prepare_LDinput_ubclung.R
Rscript src/coloc_UBClung/002_prepare_LDinput_ubclung.R "UBCLung"
##Get LD:
#with parameters for UBCLung
sbatch src/coloc/002_get_LD.sh
##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-295744.out
##Create UBCLung eQTL regional data from eQTL summary stats:
#NOTE(review): this mkdir is commented out but the tabix loop below writes into
#eQTL_region_stat/ - the directory is assumed to already exist.
#mkdir ${tmp_path}/ubclung/eQTL_region_stat/
module load tabix
lung_eQTL="/data/gen1/reference/lung_eQTL"
#For each credible set, extract a +/-1Mb eQTL window around the sentinel position
#(chr and pos parsed from the SA_<chr>_<pos>_<ref>_<alt> identifier):
for c in ${!cs[@]}
do
CREDSET="${cs[c]}"
read chr < <(echo $CREDSET | awk -F "_" '{print $2}')
read pos < <(echo $CREDSET | awk -F "_" '{print $3}')
pos1=`expr $pos - 1000000`
pos2=`expr $pos + 1000000`
#clamp the window start at 0 for sentinels within 1Mb of the chromosome start:
if (( pos1 < 0 )); then
pos1=0
fi
tabix -h ${lung_eQTL}/METAANALYSIS_Laval_UBC_Groningen_chr${chr}_formatted.txt.gz ${chr}:${pos1}-${pos2} > ${tmp_path}/ubclung/eQTL_region_stat/${CREDSET}_eQTL_region.txt
done
##Run colocalisation:
dos2unix src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
dos2unix src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r
chmod +x src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
chmod +x src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r
for c in ${!cs[*]}; do
#N = number of expression probesets for this credset; one coloc job per probeset:
N=`cat ${tmp_path}ubclung/${cs[c]}_UBCLung_probesets.txt | wc -l`
sbatch --array=1-${N}%20 --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
sleep 5
done
######QUALITY CHECKS:
#to find number of genes: 468
wc -l ${tmp_path}/ubclung/SA_*_UBCLung_probesets.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
##Check that all genes for each tissue have been analysed:
#NOTE(review): grep is case-sensitive and the patterns below mix "UBClung", "ubclung" and
#"UBCLung" - confirm each pattern actually matches the log/result file names, otherwise
#these QC counts will be 0.
grep "UBClung" ${tmp_path}/logerror/coloc_susie_UBCLung*.out | awk -F ":" '{print $1}' | sort -u | wc -l
##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr ${tmp_path}/results/ubclung/*all_coloc.rds | grep "ubclung" | wc -l
ls -lthr ${tmp_path}/results/ubclung/*all_susie*.rds | grep "UBCLung" | wc -l
#Keep the header plus rows where column 13 is "TRUE" (significant coloc results):
awk 'NR ==1; $13 == "TRUE" {print $0}' ${tmp_path}/results/coloc_asthma_ubclung.tsv \
> output/coloc_asthma_ubclung.tsv
#no TRUE coloc-susie results for UBCLung
#Find statistically significant colocalisation results for UBCLung, and add results into var2gene_raw.xlsx:
#R gave error for xlsx and Java, so I find statistically significant colocalisation results for GTExV8 and eqtlGen eQTL, and add results into var2gene_raw.xlsx:
Rscript ./src/coloc_UBClung/004_concat_coloc_results_ubclung.R
#Added genes into Var_to_Gene/input/var2genes_raw.xlsx file.
#Merge all the gene from eQTL colocalisation:
Rscript src/merge_genes_eqtl_coloc.R
################
#2.2 APPROXIMATE COLOCALISATION - pQTL (LOOK-UP FOR ALL CREDIBLE SET VARIANTS)
#UKB, SCALLOP, deCODE
################
##Exclude chromosome 6 - no pQTL look-up for this locus.
##Genomic boundaries: PIP-max causal variant +/- 1Mb
###UKBIOBANK pQTL LOOK-UP###
#Nick generate the pQTL dataset, as for /data/gen1/UKBiobank/olink/pQTL/README.txt:
# OLINK pQTL analysis
#* 48,195 European samples (as defined in Shrine et al. 2023)
#* 1463 proteins (proteins.txt)
#* Phenotype: untransformed log2 fold protein levels (olink_pheno.txt)
#* Covariates: olink batch, age, sex, genotyping array, PC1-10 (olink_covar.txt)
#* 44.8 million MAC >= 5 variants from HRC+UK10K imputation (/data/ukb/imputed_v3)
#* Additive model with regenie (run_step1.sh & run_step2.sh)
#* Full results for each protein tabixed in results directory
#* Script pqtl_lookup.sh for look ups, run with no arguments for usage info
# Nick created a look up script for UKB pQTL:
#/data/gen1/UKBiobank/olink/pQTL/pqtl_lookup.sh -s <RSID> -r <CHR:START-END> [ -f <PROTEINS FILE> ] [ -p <P THRESHOLD> ] [-h]
#UKB pQTL is in GRCh38, need to find sentinel variants position in GRCh38:
mkdir ${tmp_path}/ukb_pqtl
#input for liftOver: one BED-like line per credible-set variant (chr prefix, pos, pos+1).
#NOTE(review): BED intervals are 0-based half-open, so for a 1-based b37 position p the
#interval should be (p-1, p); as written this lifts over the base at p+1 - confirm that
#000_preprocess_cs_b38.R below compensates for this.
awk -F ' ' 'NR > 1 {print "chr"$3, $4, $4+1}' $cs_all \
> ${tmp_path}/ukb_pqtl/cs_vars_liftover_input.txt
## download the chain file b37 to b38
wget -P /home/n/nnp5/software/liftover https://hgdownload.soe.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz
/home/n/nnp5/software/liftover/liftOver \
${tmp_path}/ukb_pqtl/cs_vars_liftover_input.txt \
/home/n/nnp5/software/liftover/hg19ToHg38.over.chain.gz \
${tmp_path}/ukb_pqtl/cs_vars_liftover_output.bed \
${tmp_path}/ukb_pqtl/cs_vars_liftover_unlifted.bed
##divide cs vars into the respective credset with b38 location:
dos2unix src/pQTL_coloc/000_preprocess_cs_b38.R
chmod +x src/pQTL_coloc/000_preprocess_cs_b38.R
Rscript src/pQTL_coloc/000_preprocess_cs_b38.R \
$cs_all \
$tmp_path/ukb_pqtl/ \
${tmp_path}/ukb_pqtl/cs_vars_liftover_output.bed
#pvalue theshold based on bonferroni correction by the number of measured proteins:
dos2unix src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
chmod +x src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
#array index 0-16: one task per credible set (16 loci plus chr6, see cs array above):
sbatch --array 0-16 src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
#check that all credible sets variants have been analysed:
find ${tmp_path}/ukb_pqtl/*rs*/ -type f | wc -l
#combine look-up ukb-pqtl files with Nick's script (from /data/gen1/UKBiobank/olink/pQTL/orion_pain):
cd ${tmp_path}/ukb_pqtl/
/home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/001_combine_pqtl.awk *rs*/* \
> ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt
cp ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt output/
cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene/ #or wherever the folder 'Var_to_Gene' is located
#extract gene showing look-up results (column 2 of the combined look-up table):
awk 'NR > 1 {print $2}' ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt | sort -u \
> input/ukbpqtl_var2genes_raw
###deCODE pQTL LOOK-UP###
#No look-up results for deCode
#Found this script of Nick for decode lookup (from /data/gen1/TSH/coloc_susie/lookup_decode.awk):
#create credible_set.snps: create credible_set.snps in alphabetical order
#($5 < $6 ? $5 : $6)"_"($6 > $5 ? $6 : $5)
#locus85 10_101220474_A_G
#locus85 10_101221275_C_T
mkdir ${tmp_path}/decode_pqtl
#From Chiara/Kayesha script and Nick:
sbatch src/pQTL_coloc/000_submit_lookup_decode.sh
#filter out gene name with significant pQTL:
#(keep proteins whose log file reports >0 significant hits in column 5 - presumably a
#hit count; confirm against the log header - and strip the ".txt" suffix to get the gene name)
awk 'NR > 1 && $5 > 0 {print $1}' ${tmp_path}/decode_pqtl/log_pQTL_decode_analysis | sed 's/.txt//g' \
> input/decode_pqtl_var2genes_raw
###SCALLOP pQTL LOOK-UP###
#From Chiara's script R:\TobinGroup\GWAtraits\Chiara\pQTL_SCALLOP and Nick's script scallop_lookup.awk
#FIX: 'mkidr' was a typo for 'mkdir' - the SCALLOP output directory was never created,
#so the lookup job below would have had nowhere to write.
mkdir ${tmp_path}/scallop_pqtl
#From Chiara and Nick script:
sbatch src/pQTL_coloc/000_submit_lookup_scallop.sh
#filter out gene name with significant pQTL:
#(keep proteins whose log file reports >0 significant hits in column 5; strip ".txt" to get the gene name)
awk 'NR > 1 && $5 > 0 {print $1}' ${tmp_path}/scallop_pqtl/log_pQTL_SCALLOP_analysis | sed 's/.txt//g' \
> input/scallop_pqtl_var2genes_raw
##Chrom Pos MarkerName Allele1 Allele2 Freq1 FreqSE Effect StdErr P-value Direction TotalSampleSize Gene
#Keep genome-wide significant rows (P-value in column 10 < 5e-8) and tag each with its protein name:
awk -F "\t" '$10 < 5e-8 {print $0, $13="CA-125"}' ${tmp_path}/scallop_pqtl/CA-125.txt \
> output/scallop_ukbpqtl.txt
awk -F "\t" '$10 < 5e-8 {print $0, $13="ST2"}' ${tmp_path}/scallop_pqtl/ST2.txt \
>> output/scallop_ukbpqtl.txt
#Merge genes from the different pQTL look-up analyses, prefixing each gene with "pQTL":
cat input/ukbpqtl_var2genes_raw input/scallop_pqtl_var2genes_raw input/decode_pqtl_var2genes_raw \
| awk '{print $2="pQTL", $1}' > input/pqtl_lookup_genes_merged
################
#3 POLYGENIC PRIORITY SCORE (PoPS)
################
#https://github.com/FinucaneLab/pops
#Looking at the github repo and to Jing's code, I compiled PoPS in PoPS.sh and submit_pops.sh:
#PoPS.sh internally calls submit_pops.sh to be submitted as a job - it requires lots of time.
mkdir ${tmp_path}/pops
mkdir ${tmp_path}/pops/results
bash PoPS.sh
#Look at the results and find top score genes within a +/-250Kb from highest-PIP variant for each locus -
#if no top genes in +/- 250Kb, enlarge the window to +/-500Kb:
Rscript src/PoPS/PoPS_summary.R
#Added /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/pops_var2genes_raw.txt to the var2genes_raw.xlsx.
################
#3 NEARBY HUMAN ORTHOLOG MOUSE KO GENE
#NOTE(review): section numbering repeats "#3" here (PoPS above is also #3) - renumber if
#these headers are referenced elsewhere.
################
#From Jing's code as well as my own input:
#Download genotype-phenotype data from IMPC (International Mouse Phenotyping consortium)
#https://www.mousephenotype.org/data/release
mkdir ${tmp_path}/mouse_ko
#latest release 2023-07-06:
wget -P ${tmp_path}/mouse_ko/ https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/genotype-phenotype-assertions-ALL.csv.gz
#latest release 2023-11-22 (human-mouse ortholog mapping, HCOP 15-column file):
wget -P ${tmp_path}/mouse_ko/ http://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz
#run the analysis:
Rscript src/mouse_ko/mouse_ko.r > ${tmp_path}/mouse_ko/output_mouse_ko
#Upload /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/mouse_ko_genes_raw.txt genes into var2genes_raw.xlsx.
cp ${tmp_path}/mouse_ko/results_500kb.csv input/mko_results_500kb.csv
################
#4 NEARBY RARE MENDELIAN DISEASE GENE
################
#From Jing's code as well as my own input:
#Download genotype-phenotype data from https://www.orphadata.com/genes/:
mkdir ${tmp_path}/rare_disease/
#latest release 2023-06-22:
wget -P ${tmp_path}/rare_disease/ https://www.orphadata.com/data/xml/en_product6.xml
#downloaded locally and converted in xlsx in Excel - uploaded in /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product6.xlsx
#dowload description of the file:
wget -P src/report/ https://www.orphadata.com/docs/OrphadataFreeAccessProductsDescription.pdf
#Downloaded hpo: human phenotype ontology provides a standardized vocabulary of phenotypic abnormalities encountered in human disease
#latest release June 2023:
#Downloaded locally and converted in xlsx in Excel - uploaded in /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product4.xlsx
#https://github.com/Orphanet/Orphadata_aggregated/blob/master/Rare%20diseases%20with%20associated%20phenotypes/en_product4.xml
#run the analysis:
Rscript src/rare_disease/rare_disease.r > ${tmp_path}/rare_disease/output_rare_disease
#Upload /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_disease_genes_raw.txt genes into var2genes_raw.xlsx.
cp ${tmp_path}/rare_disease/results_500kb_by_gene.csv input/raredis_results_500kb_by_gene.csv
################
#5 RARE VARIANT UKBiobank ANALYSIS in RAP
################
#Prep file ALICE3:
##Match UK Biobank application IDs:
mkdir ${tmp_path}/rare_variant/
Rscript src/rare_variant/000_dataprep_rarevar.R
#Copy the pheno covariate file in /rfs/:
cp ${tmp_path}/rare_variant/demo_EUR_pheno_cov_broadasthma_app88144.txt /rfs/TobinGroup/data/UKBiobank/application_88144/
#Gene-collapsing rare variant analysis:
#https://github.com/legenepi/rare_collapsing
#Single rare variant analysis: NB - need to do it for rare variant ExWAS ! change filter in plink!
#src/rare_variant/submit_rare_variant.sh
################
#6 TABLES FOR EACH ANALYSIS AND MERGE GENES FOR GENE PRIORITISATION AND VISUALISATION
################
#TO NOTE: ONLY FOR
#nearest gene
#functional annotation
#eQTL
#mouse_KO
#PoPS
#pQTL
#rare_disease
#SINGLE AND GENE-BASED COLLAPSING ANALYSIS: no genes for variant-to-gene mapping, so I did not add them to this table
Rscript src/Locus_to_genes_table.R
Rscript src/genes_heatmap.R
#copy the final heatmap and full table to the shared group area:
cp output/V2G_heatmap_subplots.png /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/
cp src/report/var2gene_full.xlsx /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/
################
#7 REGION PLOTS WITH VAR2GENE RESULTS
################
#bash Region_plot_V2G.sh
#Region_plot_V2G.R
#Region_plot_V2G_2.R
#######ADDITIONAL - COMPARISON WITH VALETTE ET AL. 2021 RESULTS#############
#how many evidence for each gene?
awk '$3 == 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq -c | sort -k1 -r
#genes supported by 2+ lines of evidence:
awk '$3 == 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq -c | awk '$1 != 1 {print $2}' \
> ${tmp_path}/gene_2plus_evidence
#Are these genes with 2+ evidence already described/found in asthma studies ?
##Valette et al. 2021:
#FIX: wget's directory-prefix option is -P (as used for the IMPC/Orphanet downloads above);
#-F is --force-html, which only affects '-i' input parsing and would download the file into
#the current directory instead of ${tmp_path}.
wget -P ${tmp_path}/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8187656/bin/42003_2021_2227_MOESM4_ESM.xlsx
#SD17: "Supplementary Data 17. Druggability of the 806 target genes identified in this study.
#Put all the 806 gene identified by Valette et al. 2021 into this file:
nano ${tmp_path}/all_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/all_genes_Valette2021 | wc -l
#All genes are identified by Valette et al.2021
#59 genes, among which all the 2+ evidence were found by Valette et al. as well:
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq | grep -w -F -f - ${tmp_path}/all_genes_Valette2021 | wc -l
grep -w -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/all_genes_Valette2021 | wc -l
#34 NOT IDENTIFIED BY VALETTE (but all with just one level of evidence in my analysis):
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq | \
grep -v -w -F -f ${tmp_path}/all_genes_Valette2021 - | sort | sed -z 's/\n/,/g;s/,$/\n/'
#In green are 29 target genes that overlapped among lung TWAS genes, blood eGenes, and chromatin contact genes."
#Put the 29 genes into this file:
#9 genes in the 29 with three level of evidence: 4 in the 2+ my level of evidence, 5 just one evidence.
nano ${tmp_path}/target_genes_Valette2021
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq | grep -w -F -f - ${tmp_path}/target_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/target_genes_Valette2021
#the four with two+ level of evidence:
#CAMK4
#IKZF3
#IL4R
#SMAD3
#The five with one level of evidence:
#GSDMB
#HLA-DQB1
#MED1
#RAD50
#TAP2
#Supplementary Data 18. PheWAS for the 40 genes prioritized as therapeutic targets for asthma.
#aOverall association score for asthma from the Open Targets Platform (ref21).
#bDGIdb, Drug-gene interaction database (ref22).
#cDruggable genome (ref23).
#dAsthma drug targets derived from El-Husseini et al. (ref20)
#Put the 40 genes into this file:
#9 in total: 4 in the 2+ my level of evidence, 5 just one evidence.
nano ${tmp_path}/druggable_genes_Valette2021
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq | grep -w -F -f - ${tmp_path}/druggable_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/druggable_genes_Valette2021
#the four with two+ level of evidence:
#IL1RL1
#RORA
#SMAD3
#IL4R
#The five with one level of evidence:
#IL18RAP
#CCR4
#IL13
#IL33
#ERBB3
#Targets of existing asthma drugs are in bold - taken from Table 4 of Valette et al 2021.
#CCR4, IL13,IL2RA,IL4R,IL5,IL6,SMAD3,TNFSF4,TSLP
nano ${tmp_path}/existing_asthma_drugtarget
#Two genes in the two+ level of evidence among the gene already targeted for asthma:
#IL4R,SMAD3
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq | grep -w -F -f - ${tmp_path}/existing_asthma_drugtarget
#CCR4
#IL13
#IL4R
#SMAD3
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/existing_asthma_drugtarget
#IL4R
#SMAD3
I used several methods, mainly following the previous study of
our group, Shrine et al. 2023 (https://www.nature.com/articles/s41588-023-01314-0).
Code
src/Variant_annotation_FAVOR.R
#!/usr/bin/env Rscript
#Rationale: Variant Annotation with FAVOR
#Run as:
#Rscript src/Variant_annotation_FAVOR.R input/FAVOR_credset_chrpos38_2023_08_08.csv
#Redirect stdout to stderr so all console output ends up in the error stream/log:
sink(stderr())
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(corrplot))
#suppressMessages(library(xlsx))
#suppressMessages(library(qdap))
suppressMessages(library(VennDiagram))
library(RColorBrewer)
#NOTE(review): the commandArgs() block is commented out and the input path is hard-coded
#below, so the CSV argument passed on the command line is ignored.
#args <- commandArgs(T)
#favor_file <- args[1]
favor_file <- "/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/FAVOR_credset_chrpos38_2023_08_08.csv"
favor <- fread(favor_file)
#Keep only the FAVOR annotation columns used downstream (position, Gencode categories,
#FANTOM5 CAGE, enhancer resources, integrative scores, ClinVar fields):
favor.digest <- favor %>% select("Variant (VCF)", "Chromosome", "Position", "rsID", "Genecode Comprehensive Category",
"Genecode Comprehensive Info", "Genecode Comprehensive Exonic Category", "Genecode Comprehensive Exonic Info",
"CAGE Promoter", "CAGE Enhancer", "GeneHancer", "SuperEnhancer", "CADD phred", "Fathmm XF",
"aPC-Protein-Function", "aPC-Conservation", "aPC-Epigenetics-Active", "aPC-Epigenetics-Repressed",
"aPC-Epigenetics-Transcription", "aPC-Local-Nucleotide-Diversity", "aPC-Mutation-Density",
"aPC-Transcription-Factor", "aPC-Mappability", "Clinical Significance",
"Clinical Significance (genotype includes)", "Disease Name",
"Disease Name (included variant)", "Review Status", "Allele Origin",
"Disease Database ID", "Disease Database ID (included variant)", "Gene Reported")
#Convert spaces/hyphens in column names to dots (syntactic R names, e.g. "CADD phred" -> "CADD.phred"):
colnames(favor.digest) <- make.names(colnames(favor.digest), unique=TRUE)
#FAVOR - Nearest Gene to variants with highest PIP in the locus:
#Identify whether variants cause protein coding changes using Gencode genes definition systems
#it will label the gene name of the variants has impact, if it is intergenic region
#the nearby gene name will be labeled in the annotation.
##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
#cp /scratch/gen1/nnp5/Var_to_Gen_tmp/ukb_pqtl/cs_vars_liftover_output.bed /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","posb37")
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$posb37,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)
#For each locus keep the variant with the maximum average PIP (the "sentinel"):
sentinels <- data.frame(matrix(ncol = 4,nrow = 0))
colnames(sentinels) <- c("locus","sentinel","chr","posb37")
for(i in locus){
locus_sig_list <- sig_list_tmp %>% filter(locus == as.character(i))
locus_sig_list <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(locus,sentinel,chr,posb37)
sentinels <- rbind(sentinels,locus_sig_list)
}
#liftover for the sentinels to find their b38 positions:
posb38 <- fread("input/highestPIP_sentinels_hglft_genome_b38.bed") %>% rename(posb38=V2)
posb38 <- posb38 %>% separate(V4, c("chr", "pos2", "posb37"))
posb38 <- posb38 %>% select(chr, posb37, posb38)
posb38$chr <- as.numeric(gsub("chr","",posb38$chr))
posb38$posb37 <- as.numeric(posb38$posb37)
sentinel_b38 <- sentinels %>% left_join(posb38, by =c("chr", "posb37")) %>% rename(Chromosome=chr, Position=posb38)
###Nearest genes:
favor.digest.NG <- favor.digest %>% select(Chromosome, Position, "Genecode.Comprehensive.Info") %>% rename(Nearest_gene="Genecode.Comprehensive.Info")
sentinel_b38_NG <- sentinel_b38 %>% left_join(favor.digest.NG, by = c("Chromosome", "Position")) %>% rename(posb38=Position)
#Take closet genes when multiples are annotated:
#NOTE(review): the hard-coded row indices below assume a fixed row order of sentinel_b38_NG;
#if the input credible-set file changes, these manual overrides will hit the wrong rows.
#1:AP001189.5
#5:TSLP
#8:IL33
#9:IL1RL1
#16:HLA-DQA1
#17:AC044784.1
sentinel_b38_NG$Nearest_gene[1] <- "AP001189.5"
sentinel_b38_NG$Nearest_gene[5] <- "TSLP"
sentinel_b38_NG$Nearest_gene[8] <- "IL33"
sentinel_b38_NG$Nearest_gene[9] <- "IL1RL1"
sentinel_b38_NG$Nearest_gene[16] <- "HLA-DQA1"
sentinel_b38_NG$Nearest_gene[17] <- "AC044784.1"
fwrite(sentinel_b38_NG,"output/PIP_sentinels_nearestgenes",sep="\t",quote=F)
#Split comma-separated multi-gene annotations into a unique gene vector:
nearest_gene <- unique(unlist(strsplit(sentinel_b38_NG$Nearest_gene,",")))
#Categorical annotation:
##Exploratory:
summary(as.factor(favor.digest$"Genecode.Comprehensive.Category"))
summary(as.factor(favor.digest$"Genecode.Comprehensive.Info"))
##Info for the 5 coding variants:
summary(as.factor(favor.digest$"Genecode.Comprehensive.Exonic.Category"))
summary(as.factor(favor.digest$"Genecode.Comprehensive.Exonic.Info"))
#FAVOR functional:
#FANTOM5 CAGE promoter or enhancer, Functional Integrative score > 15, ClinVar
#CAGE promoter or enhancer (from FANTOM5):
fantom5 <- favor.digest %>% filter(!is.na(CAGE.Promoter) | !is.na(CAGE.Enhancer))
fantom5_genes <- unique(fantom5$Genecode.Comprehensive.Info)
fantom5_df <- fantom5 %>% select(Chromosome, Position, "Genecode.Comprehensive.Info", CAGE.Promoter, CAGE.Enhancer) %>% rename(Nearest_gene="Genecode.Comprehensive.Info")
fwrite(fantom5_df,"output/fnc_annot_fantom5",sep="\t",quote=F,row.names=F)
#Functional integrative score:
#Among 562 (53 removed because NAs) there are a correlations between aPC.Epigenetics.Active and aPC.Transcription.Factor (0.75), between aPC.Mutation.Density
#and aPC.Local.Nucleotide.Diversity (0.80); between CADD.phred and aPCs.Conservation (0.83).
favor.aPCs <- favor.digest %>% select("CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
"aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
"aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
favor.aPCs <- na.omit(favor.aPCs)
M <- cor(favor.aPCs)
head(round(M,3))
corrplot(M, method="number")
##Integrative score > 15: keep variants where any PHRED-scaled score exceeds the CADD-suggested
##functional-relevance threshold of 15:
inscores <- favor.digest %>% filter(CADD.phred > 15 | aPC.Protein.Function > 15 | aPC.Conservation > 15 | aPC.Epigenetics.Active > 15 | aPC.Epigenetics.Repressed > 15 |
aPC.Epigenetics.Transcription > 15 | aPC.Local.Nucleotide.Diversity > 15 | aPC.Mutation.Density > 15 | aPC.Transcription.Factor > 15 | aPC.Mappability > 15)
inscores_genes <- unique(inscores$Genecode.Comprehensive.Info)
#Among 99 integrative score > 15 variants, these correlations change a little: between aPC.Epigenetics.Active and aPC.Transcription.Factor (0.69), between aPC.Mutation.Density
#and aPC.Local.Nucleotide.Diversity (0.63); between CADD.phred and aPCs.Conservation (0.91).
inscores.aPCs <- inscores %>% select("CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
"aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
"aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
M2 <- cor(inscores.aPCs)
#FIX: print the correlation matrix of the filtered (score > 15) variants, M2 - the original
#line re-printed M (the all-variant matrix computed above), so the new result was never shown.
head(round(M2,3))
#Save the corplot x the report:
png(file = "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/output/corrplot_integrative_score_15.png")
corplot_integrativescore <- corrplot(M2, method="number")
dev.off()
inscores.aPCs_df <- inscores %>% select(Chromosome, Position, Genecode.Comprehensive.Info, "CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
"aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
"aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
fwrite(inscores.aPCs_df,"output/fnc_annot_inscores",sep="\t",quote=F,row.names=F)
#Clinical annotation:
clinvar <- favor.digest %>% filter(!is.na(Clinical.Significance))
clin_genes <- unique(clinvar$Gene.Reported)
clinvar_df <- clinvar %>% select(Chromosome, Position, Clinical.Significance, Gene.Reported)
fwrite(clinvar_df,"output/fnc_annot_clinvar",sep="\t",quote=F,row.names=F)
#Genes from Variant Annotation:
#fantom5_genes, inscores_genes, clin_genes
##Polish the genes (remove GeneID, remove ensembl, remove where mutation occurs, keep only gene name)
#fantom_genes: everything inside parenthesis needs to be deleted, and divide genes listed together
#NOTE(review): bracketX() is from qdap, whose library() call is commented out at the top of
#this script - these lines error unless qdap is attached. Confirm how this was actually run.
fantom5_genes <- bracketX(fantom5_genes) %>% unique()
fantom5_genes <- unlist(strsplit(fantom5_genes, "\\s*,\\s*"))
#inscores_genes: everything inside parenthesis needs to be deleted, and divide genes listed together
inscores_genes <- bracketX(inscores_genes) %>% unique()
inscores_genes <- unlist(strsplit(inscores_genes, "\\s*,\\s*"))
inscores_genes <- unlist(strsplit(inscores_genes, ';', fixed=TRUE))
#clin_genes: separate element if '|' present, and then remove GeneID-after the ':'
clin_genes <- unlist(strsplit(clin_genes, '|', fixed=TRUE))
clin_genes <- gsub(":.*", "", clin_genes) %>% unique()
#Save the genes:
fwrite(as.data.frame(nearest_gene),"input/nearest_genes_raw",row.names=FALSE, col.names=FALSE,quote=F)
#NOTE(review): write.xlsx() is from the xlsx package, also commented out at the top, and
#'varannot_genes' is never defined in this script - presumably it should be the combined
#unique set of fantom5_genes, inscores_genes and clin_genes. Confirm before re-running.
write.xlsx(varannot_genes,"/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx",sheetName = "varannot_genes", row.names=FALSE, col.names=FALSE)
Variant annotation can be represented in qualitative terms, such as
variant effect predictor category or in quantitative terms with a score
including protein function, conservation, epigenetics, integrative
function (ref https://www.nature.com/articles/s41588-020-0676-4). When
dealing with quantitative measures for the same functional
category, these can be summarised using principal components, as
implemented for the first time in the STAAR framework (ref https://www.nature.com/articles/s41588-020-0676-4) and
adopted in Functional Annotation of Variants Online Resources (FAVOR) as
well. Annotation Principal Component (aPC) is the first PC among
standardised individual score for a specific functional category.
The annotation PC is transformed into a PHRED-scale score, as CADD is. The CADD authors
suggest 15 as a good threshold for functionally relevant variants (https://cadd.gs.washington.edu/info).
FAVOR
aggregates different sources and categories of annotation. I decided to
focus on:
Nearest gene for each credible set top PIP variant (“Genecode.Comprehensive.Info”);
Functional annotation for all 615 credible set variants looking at either:
2.1. FANTOM5 Gene Enhancer and/or Gene Promoter (“CAGE.Promoter”, “CAGE.Enhancer”)
2.2. Integrative scores > 15 (“CADD.phred”,“aPC.Protein.Function”,“aPC.Conservation”,“aPC.Epigenetics.Active”, “aPC.Epigenetics.Repressed”,“aPC.Epigenetics.Transcription”,“aPC.Local.Nucleotide.Diversity”,“aPC.Mutation.Density”, “aPC.Transcription.Factor”, “aPC.Mappability”);
2.3. Clinical annotation from ClinVar (“Clinical.Significance”, “Gene.Reported”)
Code
src/coloc/000_preprocess_cs.R
#!/usr/bin/env Rscript
# Rationale: split the credible-set table into one file per genomic locus,
# naming each file after the top-PIP variant of that locus.
library(tidyverse)
library(data.table)
cli_args <- commandArgs(trailingOnly = TRUE)
cred_set <- cli_args[1]
tmp_path <- cli_args[2]
cs <- fread(cred_set)
# Drop the MHC locus on chromosome 6: no eQTL colocalisation is run there.
loci <- cs %>%
  filter(locus != "6_rs9271365_32086794_33086794") %>%
  pull(locus) %>%
  unique()
# One output file per locus, named SA_<chr>_<pos>_<a1>_<a2> of the top-PIP variant.
# NB: allele2 is the effect allele; the alleles are swapped on purpose because in
# maf001_broad_pheno_1_5_ratio_betase_input_mungestat a1 is the effect allele
# and a2 the non-effect allele.
for (locus_id in loci) {
  cs_tmp <- cs %>% filter(locus == as.character(locus_id))
  top <- cs_tmp %>% filter(PIP_average == max(cs_tmp$PIP_average))
  chr <- top %>% select(chromosome)
  location <- top %>% select(position)
  allele1 <- top %>% select(allele2)
  allele2 <- top %>% select(allele1)
  fwrite(cs_tmp, paste0(tmp_path, "SA_", chr, "_", location, "_", allele1, "_", allele2), quote=F, sep="\t")
}
src/coloc/000A_submit_eqtl_gtex_extraction.sh
#!/bin/bash
#==================================================================
# File: 000A_submit_eqtl_gtex_extraction.sh
# Project: SevereAsthma
# Author: NNP
# Date: 31 October 2023
# Rationale: Submit one job per chromosome to extract GTExv8 eQTL
#            data and produce liftOver input (000A_eqtl_gtex_extraction.R).
# Submission: sbatch --array=1-22 src/coloc/000A_submit_eqtl_gtex_extraction.sh
#==================================================================
#SBATCH --job-name=eqtl_gtex_extraction
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=3:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --array=1-22 # array ranks to run
# Array rank doubles as the chromosome number passed to the R script.
i=$SLURM_ARRAY_TASK_ID
module load R
Rscript src/coloc/000A_eqtl_gtex_extraction.R ${i}
src/coloc/000A_eqtl_gtex_extraction.R
#!/usr/bin/env Rscript
#==================================================================
# File: 000A_eqtl_gtex_extraction.R
# Project: SevereAsthma
# Author: NNP - edited from KC
# Date: 31 October 2023
# Rationale: Load eQTL data for the relevant tissues and extract
#            chromosome/position columns for liftOver.
#==================================================================
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
#Sys.setenv(LIBARROW_BINARY = TRUE); install.packages('arrow', type = "source")
suppressMessages(library(arrow))
options(scipen=999)
cli <- commandArgs(trailingOnly = TRUE)
chr <- cli[1]
setwd("/data/gen1/reference/GTEx_Analysis_v8_QTLs/GTEx_Analysis_v8_EUR_eQTL_all_associations")
# Identify files ----
# Per-tissue parquet files for this chromosome, restricted to Skin/Colon.
# (Kayesha's original selection: Adrenal|Artery|Blood|Brain|Esophagus|Heart|
#  Ileum|Liver|Lung|Muscle|Nerve|Pituitary|Spleen|Stomach)
tissue_files <- list.files(path = ".", pattern = paste0("*.v8.EUR.allpairs.chr", chr, ".parquet")) %>%
  str_subset(pattern = "Skin|Colon")
for (tissue_file in tissue_files) {
  tissue_name <- str_remove(tissue_file, ".parquet")
  cat(paste0("Tissue: ", tissue_name, "\n"))
  # Read in data ----
  cat("Loading data...\n")
  raw <- read_parquet(tissue_file)
  # Split variant_id into chromosome/position/ref/alt and build an
  # allele-order-free ID (alleles sorted alphabetically) ----
  cat("Wrangling data...\n")
  wrangled <- raw %>%
    separate(variant_id, c("CHROM", "POS", "REF", "ALT")) %>%
    mutate(CHROM = str_replace(CHROM, "chr", ""),
           CHROM = str_replace(CHROM, "X", "23"),
           CHROM = as.numeric(CHROM),
           POS = as.numeric(POS),
           ID = paste0(CHROM, "_", POS, "_", pmin(REF, ALT), "_", pmax(REF, ALT)))
  # Write the hg38 summary statistics ----
  cat(paste0("Writing file: ", tissue_name, ".hg38.txt.gz", "\n"))
  fwrite(wrangled, file = paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/", tissue_name, ".hg38.txt.gz"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = T, compress = "auto")
  # Generate BED file for liftOver ----
  cat(paste0("Creating bed file: ", tissue_name, ".hg38.bed", "\n"))
  wrangled %>%
    select(chrom = CHROM, chromStart = POS, ID) %>%
    mutate(chromEnd = as.integer(chromStart+1),
           chrom = paste0("chr", chrom),
           chrom = str_replace(chrom, "chr23", "chrX")) %>%
    select(chrom, chromStart, chromEnd, ID) %>%
    fwrite(file = paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/bed/", tissue_name, ".hg38.bed"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = F)
  cat("Done.\n\n")
}
src/coloc/000B_eqtl_gtex_liftover.sh
#!/bin/bash
#==================================================================
# File: 000B_eqtl_gtex_liftover.sh
# Project: SevereAsthma
# Author: NNP- edited from KC
# Date: 31 October 2023
# Rationale: Liftover eQTL coordinates from GRCh38 to GRCh37
#            (chain file: hg38ToHg19).
# Submission: sbatch src/coloc/000B_eqtl_gtex_liftover.sh
#==================================================================
#SBATCH --job-name=eqtl_gtex_liftover
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=23:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
FILE_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/bed"
liftOver="/home/n/nnp5/software/liftOver"
chain="/data/gen1/reference/liftOver/hg38ToHg19.over.chain"
cd ${FILE_PATH}
# For each hg38 BED file, produce <name>.hg19.bed (mapped records) and
# <name>.unlifted.bed (records the chain could not map).
for filename in *.hg38.bed
do
    name=$(basename "${filename}" .hg38.bed)
    ${liftOver} \
        "${filename}" \
        ${chain} \
        ${name}.hg19.bed \
        ${name}.unlifted.bed
done
src/coloc/000C_submit_eqtl_gtex_conversion.sh
#!/bin/bash
#==================================================================
# File: 000C_submit_eqtl_gtex_conversion.sh
# Project: SevereAsthma
# Author: NNP
# Date: 31 October 2023
# Rationale: Submit one job per chromosome to map the lifted-over
#            GTEx eQTL files back onto GRCh37 coordinates
#            (000C_eqtl_gtex_conversion.R).
# Submission: sbatch --array=1-22 src/coloc/000C_submit_eqtl_gtex_conversion.sh
#==================================================================
#SBATCH --job-name=eqtl_gtex_conversion
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=3:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --array=1-22 # array ranks to run
# Array rank doubles as the chromosome number passed to the R script.
i=$SLURM_ARRAY_TASK_ID
module load R
Rscript src/coloc/000C_eqtl_gtex_conversion.R ${i}
src/coloc/000C_eqtl_gtex_conversion.R
#!/usr/bin/env Rscript
#==================================================================
# File: 000C_eqtl_gtex_conversion.R
# Project: SevereAsthma
# Author: NNP-edited from KC
# Date: 31 October 2023
# Rationale: Update coordinates in original GTEx files to GRCh37.
#==================================================================
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
options(scipen=999)
args <- commandArgs(T)
chr <- args[1]
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8")
# Identify files ----
file.names = list.files(path = ".", pattern = paste0("*.v8.EUR.allpairs.chr", chr, ".hg38.txt.gz")) %>%
#Kayesha
#str_subset(pattern = "Adrenal|Artery|Blood|Brain|Esophagus|Heart|Ileum|Liver|Lung|Muscle|Nerve|Pituitary|Spleen|Stomach")
str_subset(pattern = "Colon|Skin")
# Chromosome X is recoded to 23 by the wrangling below, so filter on "23" when
# running for X; otherwise filter on the requested chromosome. (The two former
# if/else branches were identical except for this filter value.)
target_chr <- if (chr == "X") "23" else chr
for (file in file.names) {
# Read in data ----
name = str_remove(file, ".hg38.txt.gz")
cat(paste0("Tissue: ", name, "\n"))
## GTEx summary statistics (hg38)
hg38 <- as_tibble(fread(file))
## Liftover file (hg19)
liftover <- as_tibble(fread(paste0("bed/", name, ".hg19.bed"), header = FALSE))
# Remove chromosomes from liftover bed (hg19) which are alternative contigs,
# keep only the target chromosome, and drop duplicate variant IDs ----
cat("Wrangling liftOver data...\n")
liftover <- liftover %>%
select(CHROM = V1, GENPOS = V2, ID = V4) %>%
mutate(CHROM = str_replace(CHROM, "chr", ""),
CHROM = str_replace(CHROM, "X", "23")) %>%
filter(CHROM == target_chr) %>%
mutate(CHROM = as.integer(CHROM)) %>%
distinct(ID, .keep_all = TRUE)
# Perform mapping: swap in the hg19 coordinates and rebuild the variant ID ----
cat("Performing mapping...\n")
hg19 <- hg38 %>%
right_join(liftover, by = "ID") %>%
select(-CHROM.x, -POS) %>%
rename(CHROM = CHROM.y, POS = GENPOS) %>%
mutate(ID = paste0(CHROM, "_", POS, "_", pmin(REF, ALT), "_", pmax(REF, ALT))) %>%
relocate(c(CHROM, POS, ID), .after = phenotype_id) %>%
select(-tss_distance) # Will no longer apply to new coordinates
# Write file ----
cat(paste0("Writing file: ", name, ".hg19.txt.gz", "\n"))
fwrite(hg19, paste0(name, ".hg19.txt.gz"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = T, compress = "auto")
cat("Done.\n\n")
}
src/coloc/001_submit_GWASpairs.sh
#!/bin/bash
# Rationale: extract GWAS summary statistics around one credible-set signal.
# CREDSET is expected to be passed via `sbatch --export=CREDSET=...`.
#SBATCH --job-name=SA_GWASpairs
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=16gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE
module load R
Rscript src/coloc/001_run_GWASpairs.R $CREDSET
src/coloc/001_run_GWASpairs.R
# Rationale: extract GWAS summary statistics within +/-500kb of a
# credible-set signal and write them to the scratch area.
library(data.table)
library(tidyverse)
cli <- commandArgs(trailingOnly = TRUE)
cred_set <- cli[1]
# The credible-set name encodes pheno_chr_pos_a1_a2.
fields <- unlist(strsplit(cred_set, split="_"))
chr <- as.numeric(fields[2])
location <- as.numeric(fields[3])
tmp_path <- "/scratch/gen1/nnp5/Var_to_Gen_tmp/"
# Keep variants on the signal chromosome inside the 1Mb window.
gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>%
  filter(b37chr==chr, between(bp, location-5e5, location+5e5))
write_delim(x=gwas, file=paste0(tmp_path, cred_set, "_GWASpairs.txt.gz"))
src/coloc/002_prepare_LDinput.R
#!/usr/bin/env Rscript
# Rationale: append one lookup row per credible set (pheno, ancestry, signal,
# chromosome, position, eQTL label, region boundaries) used downstream to
# build the LD .raw files.
library(tidyverse)
cli <- commandArgs(trailingOnly = TRUE)
tissue <- cli[1]
#intermediate files in:
tmp_path <- "/scratch/gen1/nnp5/Var_to_Gen_tmp/"
lookup <- tibble(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
    "SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
    "SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
    "SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
    "SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
  separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  mutate(ancestry = "EUR") %>%
  unite("snp", chr:a2, remove=FALSE) %>%
  mutate(eQTL = tissue,
         start = as.numeric(pos) - 5e5,
         end = as.numeric(pos) + 5e5) %>%
  select(pheno, ancestry, snp, chr, pos, eQTL, start, end)
# Append without a header so repeated runs accumulate in one lookup file.
write_tsv(x=lookup, file=paste0(tmp_path,"gtex_Pairs_lookup.txt"),col_names=F, append=T)
src/coloc/002_get_LD.sh
#!/bin/bash
#SBATCH --job-name=ld
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=24gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE
# Rationale: for each signal/eQTL pair in the lookup file, extract the region
# from the ancestry-matched reference panel and recode it (.raw) for LD use.
module load plink/1.9-beta6.27-vhw5dr2
module load R
#For GTExv8:
#pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/gtex_Pairs_lookup.txt"
#DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp"
#For eqtlGen:
#pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/eqtlGenWB_Pairs_lookup.txt"
#DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen"
#For UBCLung:
pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/UBCLung_Pairs_lookup.txt"
DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung"
cd ${DIR}
while IFS='' read -r LINE || [ -n "${LINE}" ]
do
    # Lookup columns: pheno ancestry signal chr pos eQTL start end
    pheno=$(echo ${LINE} | awk '{print $1}')
    anc=$(echo ${LINE} | awk '{print $2}')
    signal=$(echo ${LINE} | awk '{print $3}')
    chr=$(echo ${LINE} | awk '{print $4}')
    eQTL=$(echo ${LINE} | awk '{print $6}')
    start=$(echo ${LINE} | awk '{print $7}')
    end=$(echo ${LINE} | awk '{print $8}')
    name="${DIR}/${signal}_${pheno}_${eQTL}"
    plink_script="${name}_plink.sh"
    if [ -f "$plink_script" ]; then
        echo "PLINK script exists, now deleted and created a new one"
        rm $plink_script
    fi
    # BUG FIX: '[ ${anc}=="EUR" ]' tested one non-empty string and was always
    # true; operands must be quoted and separated for a real comparison.
    if [ "${anc}" == "EUR" ]; then
        ref_ld="/data/gen1/LF_HRC_transethnic/signal_selection/LDpanel/EUR_UKB/ukb_imp_chr${chr}_EUR_selected"
    else
        ref_ld="/data/gen1/LF_HRC_transethnic/signal_selection/LDpanel/AFR_1000G/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_Amin_Amax_noesv_AFR"
    fi
    # Create backslash string
    bs="\\"
    #create region to extract
    region_file="${name}_region.txt"
    echo -e ${chr}'\t'${start}'\t'${end} > ${region_file}
    # Create extraction script file
    cat <<EOT>> ${plink_script}
/home/n/nnp5/software/plink2 $bs
--bfile $ref_ld $bs
--extract range ${region_file} $bs
--make-bed $bs
--out ${name}
EOT
    sh ${plink_script}
    # Now recode to additive dosage (.raw)
    plink_scriptR="${name}_plink_recode.sh"
    # BUG FIX: the stale-file check previously tested $plink_script (already
    # handled above) instead of $plink_scriptR.
    if [ -f "$plink_scriptR" ]; then
        echo "PLINK script exists, now deleted and created a new one"
        rm $plink_scriptR
    fi
    # Create recode script file
    cat <<EOT >> ${plink_scriptR}
/home/n/nnp5/software/plink2 $bs
--bfile ${name} $bs
--out ${name} $bs
--recode A
EOT
    sh ${plink_scriptR}
done < ${pairs_lookup_file}
src/coloc/003_submit_coloc_susie_GTEx.sh
#!/bin/bash
# Rationale: run coloc/coloc.susie for one tissue/credset/gene combination.
# TISSUE and CREDSET are passed via `sbatch --export`; the array rank picks the gene.
#SBATCH --job-name=coloc_susie_gtex
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
i=$SLURM_ARRAY_TASK_ID
# Gene for this array task: line i of the per-credset/per-tissue gene list.
read GENE < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_${TISSUE}_genes.txt)
module load R
Rscript ./src/coloc/003_run_coloc_susie_GTEx.R $TISSUE $CREDSET $GENE
# Example driver loop (kept for reference): one array submission per credset,
# sized to the number of genes in that credset's gene list.
#for c in ${!cs[*]}; do
# N=`cat ${tmp_path}${cs[c]}_${tissue[t]}_genes.txt | wc -l`
# sbatch --array=1-${N}%20 --export=TISSUE="${tissue}",CREDSET="${cs[c]}" ${tmp_path}003_submit_coloc_susie_GTEx.sh
# sleep 5
#done
src/coloc/003_run_coloc_susie_GTEx.R
#!/usr/bin/env Rscript
#Rationale: Run coloc and coloc.susie in GTExV8 tissues
# Inputs (positional args): tissue, credible-set name (pheno_chr_pos_a1_a2), gene.
# Outputs: coloc.abf and coloc.susie results saved as .rds (and a susie summary
# .tsv) under <tmp_path>/results/gtex/.
suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))
args = commandArgs(trailingOnly = TRUE)
tissue = args[1]
cred_set = args[2]
gene = args[3]
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
# Parse the phenotype label out of the credible-set name.
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
select(pheno) %>%
pull -> pheno
# Parse the signal label (chr_pos_a1_a2) out of the credible-set name.
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
unite("signal", chr:a2) %>%
pull -> signal
############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0(tmp_path, cred_set, "_GWASpairs.txt.gz")) %>%
rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
select(-snpid) %>%
arrange(chr, pos) %>%
drop_na(beta) %>%
mutate(varbeta = SE^2) %>%
rename(position=pos)
# Keep the GWAS effect allele so betas can be aligned to the bim file later.
GWAS$allele1.gwas <- GWAS$a1
# snp ID uses alphabetically-ordered alleles, matching the eQTL ID convention.
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
distinct(snp, .keep_all=TRUE)
# 38405 is presumably the GWAS sample size -- TODO confirm against the study.
GWAS$N <- as.numeric(38405)
# eQTL associations for this tissue, restricted to the target gene and to
# variants present in the GWAS window.
eqtlGWAS = read_delim(paste0(tmp_path, cred_set, "_", tissue, "_pairs.txt.gz")) %>%
mutate(ID = gsub(x=ID, pattern=":", replacement="_"),
varbeta = se^2) %>%
arrange(chrom, pos) %>%
drop_na(beta) %>%
filter(gene_id==gene,
ID %in% GWAS$snp) %>%
rename(snp=ID, position=pos, MAF=maf)
# Restrict the GWAS to the shared variant set.
GWAS %>% filter(snp %in% eqtlGWAS$snp) -> GWAS
############################
#Do colocalisation ONLY IF GWAS and eqtlGWAS (eQTL tissue-gene) contains pvalue <= 5x10-6.
############################
GWAS_sign <- GWAS %>% filter(pval <= 0.000005)
eqtlGWAS_sign <- eqtlGWAS %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No colocalisation is possible because eQTL data has pvalue >= 5x10-6."))
}
if (dim(GWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No colocalisation is possible because GWAS data has pvalue >= 5x10-6."))
} else {
paste0(signal," ",tissue, " ", gene, " GTExv8: Starting colocalisation because both GWAS and eQTL data has pvalue <= 5x10-6.")
}
############################
# * if also want coloc results
############################
# Single-causal-variant colocalisation (coloc.abf): GWAS is case-control ("cc"),
# eQTL is quantitative ("quant").
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta,
N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp),
dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta,
N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))
saveRDS(coloc_all, paste0(tmp_path,"results/gtex/",cred_set, "_", tissue, "_", gene, "_all_coloc.rds"))
#look at the results:
coloc <- readRDS(paste0(tmp_path,"results/gtex/", cred_set, "_", tissue, "_", gene, "_all_coloc.rds"))
#sensitivity plot:
sensitivity(coloc,"H4 > 0.8")
#
############################
##COLOC.SUSIE to allow presence of multiple causal variants
############################
# 1) Format LD matrix
############################
# .raw file produced by plink --recode A (002_get_LD.sh); drop metadata columns.
ld = data.table::fread(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
) %>%
select(!c(FID:PHENOTYPE))
# Rebuild alphabetical-allele snp IDs from the .raw column names.
ld %>%
names %>%
as_tibble %>%
separate(value,c("c", "p", "a1", "a2")) %>%
mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(snp) %>%
pull -> snps
# Keep only non-monomorphic variants shared by GWAS and eQTL datasets.
ld = t(ld) %>%
as_tibble %>%
mutate(var=apply(., 1, var, na.rm=TRUE),
snp=snps) %>%
filter(var!=0) %>%
filter(snp %in% GWAS$snp) %>%
filter(snp %in% eqtlGWAS$snp)
N = ncol(ld)-2
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>%
select(snp) %>%
pull # Add names to facilitate filtering NAs (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
# Residual NAs (pairs with no overlapping samples) are set to zero correlation.
LDmatrix[is.na(LDmatrix)] <- 0
############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
) %>%
select(!morgans) # Read bim file to check same set of SNPs and allele alignment
# Flip GWAS betas where the bim allele1 differs from the GWAS effect allele.
GWAS_df <- GWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment
#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}
# coloc expects a named list with type/N/LD attached.
GWAS_df %>%
as.list -> GWAS
GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix
############################
# 3) Format eQTL GWAS
############################
# Flip eQTL betas where the bim allele1 differs from the eQTL ref allele.
eqtlGWAS_df <- eqtlGWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=ref, -(beta), beta)) # Check allele alignment
#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
paste0(signal," ",tissue, " ", gene, " GTExv8: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}
eqtlGWAS <- eqtlGWAS_df %>% as.list
eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N
############################
# Check datasets are ok
############################
# check_dataset() returns NULL on success; log ok/warning per dataset pair.
check_dataset(eqtlGWAS, req="LD") -> eqtlGWAS_check
check_dataset(GWAS, req="LD") -> GWAS_check
if (is.null(GWAS_check) & is.null(eqtlGWAS_check)){
cat(paste0(cred_set, "_", tissue, "_", gene, ": ok", "\n"),
file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
} else {
cat(paste0(cred_set, "_", tissue, "_", gene, ": warning", "\n"),
file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
}
############################
## Run SuSie
############################
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)
saveRDS(susie_all, paste0(tmp_path,"results/gtex/", cred_set, "_", tissue, "_", gene, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
paste0(tmp_path, "results/gtex/", cred_set, "_", tissue, "_", gene, "_susie_summary.tsv"))
src/coloc/000_submit_edit_eQTLGen.sh
#!/bin/bash
# Rationale: submit the job that adds allele frequencies (and derived beta/se)
# to the eQTLGen whole-blood cis-eQTL file (000_run_edit_eQTLGen.R).
#SBATCH --job-name=edit_eQTLGen
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=8:0:0
#SBATCH --mem=80gb
#SBATCH --account=gen1
#SBATCH --export=NONE
module load R
Rscript src/coloc/000_run_edit_eQTLGen.R
src/coloc/000_run_edit_eQTLGen.R
#!/usr/bin/env Rscript
#==================================================================
# File: 000_run_edit_eQTLGen.R
# Project: SevereAsthma
# Author: NNP - edited from AW
# Date: 1 November 2023
# Rationale: attach allele frequencies to the eQTLGen whole-blood
#            cis-eQTLs and derive beta/se from the Z-score.
#==================================================================
library(tidyverse)
tmp_path <- "/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen"
# Allele frequencies, keyed by an allele-order-free variant ID
# (chr:pos_minAllele_maxAllele).
af <- read_tsv("/data/gen1/LF_HRC_transethnic/eQTL/eQTLgen/2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz",
               col_select=c("hg19_chr", "hg19_pos", "AlleleA", "AlleleB", "AlleleB_all")) %>%
  mutate(ID = paste0(hg19_chr, ":", hg19_pos, "_", pmin(AlleleA, AlleleB), "_", pmax(AlleleA, AlleleB))) %>%
  select(ID, AssessedAllele_freq = AlleleB_all)
# cis-eQTLs: join the frequency on ID, then convert Z-scores to beta/se using
# the allele frequency and per-SNP sample size.
cis_eqtl <- read_tsv("/data/gen1/reference/eqtlgen/2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz") %>%
  mutate(ID = paste0(SNPChr, ":", SNPPos, "_", pmin(AssessedAllele, OtherAllele), "_", pmax(AssessedAllele, OtherAllele))) %>%
  relocate(ID, .before="SNP") %>%
  left_join(af) %>%
  mutate(beta = Zscore / sqrt(2 * AssessedAllele_freq * (1 - AssessedAllele_freq) * (NrSamples + Zscore^2)),
         se = 1 / sqrt(2 * AssessedAllele_freq * (1 - AssessedAllele_freq) * (NrSamples + Zscore^2)))
write_tsv(x=cis_eqtl, file=paste0(tmp_path,"/whole_blood_cis_eqtls_withAF.txt.gz"))
src/coloc/001_submit_eqtl_lookup_GTEx.sh
#!/bin/bash
# Rationale: build per-credset GTEx eQTL pair/gene/lookup files.
# TISSUE and CREDSET are expected to be passed via `sbatch --export`.
#SBATCH --job-name=SA_eqtl_lookup
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=16gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
module load R
Rscript ./src/coloc/001_run_eqtl_lookup_GTEx.R $TISSUE $CREDSET
src/coloc/001_run_eqtl_lookup_GTEx.R
#!/usr/bin/env Rscript
#Rationale: Create GTExv8 eQTL files to use for colocalisation, filter genes for each eQTL files, find GWAS sumstat lookup
#in GTEXv8 for regions that I cannot do colocalisation, i.e. HLA regions.
library(data.table)
library(tidyverse)
args = commandArgs(trailingOnly = TRUE)
tissue = args[1]
cred_set = args[2]
# Credible-set name encodes pheno_chr_pos_a1_a2.
tmp = unlist(strsplit(cred_set, split="_"))
chr = as.numeric(tmp[2])
location = as.numeric(tmp[3])
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
##alleles are placed in alphabetical order ! (no dependency on effect allele, reference/alternative)
#eqtl:
#'Colon_Transverse' 'Colon_Sigmoid' 'Skin_Sun_Exposed_Lower_leg' 'Skin_Not_Sun_Exposed_Suprapubic' files are in my scratch;
#the remaining tissues live in the shared ACEI dataset. Only the directory
#differs, so pick it once and run a single extraction pipeline (the two former
#if/else branches were otherwise identical).
my_tissues <- c("Colon_Transverse", "Colon_Sigmoid", "Skin_Sun_Exposed_Lower_leg", "Skin_Not_Sun_Exposed_Suprapubic")
eqtl_dir <- if (tissue %in% my_tissues) {
paste0(tmp_path, "liftover_gtexv8/")
} else {
"/data/gen1/ACEI/colocalisation_datasets/eQTL/GTeX/"
}
# Load the tissue eQTLs for this chromosome, build the allele-order-free ID,
# and keep variants within +/-500kb of the signal.
eqtl = fread(paste0(eqtl_dir, tissue, ".v8.EUR.allpairs.chr", chr, ".hg19.txt.gz"),
select=c("phenotype_id", "CHROM", "POS", "REF", "ALT", "maf", "ma_samples", "slope", "slope_se", "pval_nominal"),
col.names=c("gene_id", "chrom", "pos", "ref", "alt", "maf", "N", "beta", "se", "pval"),
showProgress=FALSE) %>%
mutate(ID = paste0(chrom, ":", pos, "_", pmin(ref, alt), "_", pmax(ref, alt))) %>%
relocate(ID, .before="gene_id") %>%
filter(between(pos, location-5e5, location+5e5), chrom==chr)
genes = eqtl %>% distinct(gene_id)
write_delim(x=eqtl, file=paste0(tmp_path, cred_set, "_", tissue, "_pairs.txt.gz"))
write_delim(x=genes, file=paste0(tmp_path, cred_set, "_", tissue, "_genes.txt"), col_names=FALSE)
#import gwas sumstat:
gwas_sumstat <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat")
gwas_sumstat <- gwas_sumstat %>% setnames(c("b37chr","bp","a1","a2"),c("chrom","position","allele1","allele2"))
#credible set:
cs <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set))
cs <- cs %>% setnames("chromosome","chrom")
#merge credible set with gwas sumstat:
cs_gwas_sumstat <- inner_join(cs,gwas_sumstat, by=c("chrom","position","allele1","allele2","snpid"))
#merge credible set-gwas sumstat with eqtl:
# eQTL betas are flipped to the GWAS effect allele (allele1) in the last mutate.
cs_gwas_sumstat_eqtl <- cs_gwas_sumstat %>% setnames(c("position","LOG_ODDS"),c("pos","beta")) %>%
mutate(ID = paste0(chrom, ":", pos, "_", pmin(allele1, allele2), "_", pmax(allele1, allele2))) %>%
relocate(ID, .before="chrom") %>%
distinct() %>%
left_join(eqtl, by="ID", suffix=c(".gwas", ".eqtl")) %>%
mutate(beta.eqtl = if_else(alt == allele1, beta.eqtl, -beta.eqtl))
write_delim(x=cs_gwas_sumstat_eqtl, file=paste0(tmp_path, cred_set, "_", tissue, "_lookup.txt.gz"))
src/coloc/002_prepare_LDinput_eqtlgen.R
#!/usr/bin/env Rscript
# Rationale: append one lookup row per credible set, used downstream to build
# the LD .raw files for the eQTLGen colocalisation.
library(tidyverse)
cli <- commandArgs(trailingOnly = TRUE)
tissue <- cli[1]
#intermediate files in:
tmp_path <- "/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"
lookup <- tibble(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
    "SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
    "SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
    "SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
    "SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
  separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  mutate(ancestry = "EUR") %>%
  unite("snp", chr:a2, remove=FALSE) %>%
  mutate(eQTL = tissue,
         start = as.numeric(pos) - 5e5,
         end = as.numeric(pos) + 5e5) %>%
  select(pheno, ancestry, snp, chr, pos, eQTL, start, end)
# Append without a header; the output file name is keyed by the tissue argument.
write_tsv(x=lookup, file=paste0(tmp_path,tissue,"_Pairs_lookup.txt"),col_names=F, append=T)
src/coloc/003_submit_coloc_susie_eQTLGen.sh
#!/bin/bash
# Rationale: run coloc/coloc.susie for one credset/gene pair against the
# eQTLGen whole-blood data. CREDSET is passed via `sbatch --export`.
#SBATCH --job-name=coloc_susie_eqtlgen
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"
# Array rank selects the gene (one row per gene in the *_genes.txt file).
i=$SLURM_ARRAY_TASK_ID
read GENE < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_eqtlGenWB_genes.txt)
module load R
# NOTE(review): this invokes the UBCLung coloc script with tissue "eqtlGenWB"
# rather than src/coloc/003_run_coloc_susie_eQTLGen.R -- confirm which script
# is intended for the eQTLGen run.
Rscript src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r "eqtlGenWB" $CREDSET $GENE
src/coloc/003_run_coloc_susie_eQTLGen.R
#!/usr/bin/env Rscript
#Rationale: Run coloc and coloc.susie in eQTLGen wholeblood tissue
suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))
args = commandArgs(trailingOnly = TRUE)
tissue = args[1]
cred_set = args[2]
gene = args[3]
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
select(pheno) %>%
pull -> pheno
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
unite("signal", chr:a2) %>%
pull -> signal
############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set, "_GWASpairs.txt.gz")) %>%
rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
select(-snpid) %>%
arrange(chr, pos) %>%
drop_na(beta) %>%
mutate(varbeta = SE^2) %>%
rename(position=pos)
GWAS$allele1.gwas <- GWAS$a1
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
distinct(snp, .keep_all=TRUE)
GWAS$N <- as.numeric(38405)
eqtlGWAS = read_delim(paste0(tmp_path, cred_set, "_eqtlGenWB_pairs.txt.gz")) %>%
mutate(ID = gsub(x=ID, pattern=":", replacement="_"),
varbeta = se^2,
MAF = if_else(AssessedAllele_freq <0.5, AssessedAllele_freq, 1-AssessedAllele_freq)) %>%
arrange(chrom, pos) %>%
drop_na(beta) %>%
filter(GeneSymbol==gene,
ID %in% GWAS$snp) %>%
rename(snp=ID, position=pos, N=NrSamples)
GWAS %>% filter(snp %in% eqtlGWAS$snp) -> GWAS
############################
#Do colocalisation ONLY IF GWAS and eqtlGWAS (eQTL tissue-gene) contains pvalue <= 5x10-6.
############################
GWAS_sign <- GWAS %>% filter(pval <= 0.000005)
eqtlGWAS_sign <- eqtlGWAS %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, ": No colocalisation is possible because eQTL data has pvalue >= 5x10-6."))
}
if (dim(GWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, ": No colocalisation is possible because GWAS data has pvalue >= 5x10-6."))
} else {
paste0(signal," ",tissue, " ", gene, ": Starting colocalisation because both GWAS and eQTL data has pvalue <= 5x10-6.")
}
############################
# * if also want coloc results
############################
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta,
N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp),
dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta,
N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))
saveRDS(coloc_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_eqtlGenWB_", gene, "_all_coloc.rds"))
############################
# 1) Format LD matrix
############################
ld = data.table::fread(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
) %>%
select(!c(FID:PHENOTYPE))
ld %>%
names %>%
as_tibble %>%
separate(value,c("c", "p", "a1", "a2")) %>%
mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(snp) %>%
pull -> snps
ld = t(ld) %>%
as_tibble %>%
mutate(var=apply(., 1, var, na.rm=TRUE),
snp=snps) %>%
filter(var!=0) %>%
filter(snp %in% GWAS$snp) %>%
filter(snp %in% eqtlGWAS$snp)
N = ncol(ld)-2
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>%
select(snp) %>%
pull # Sdd names to facilitate filtering Nas (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
LDmatrix[is.na(LDmatrix)] <- 0
############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
) %>%
select(!morgans) # Read bim file to check same set of SNPs and allele alignment
# Restrict the GWAS to variants present in the LD matrix and flip betas
# whenever the bim allele1 differs from the GWAS effect allele.
GWAS_df <- GWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment
#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}
# Convert to the list layout expected by coloc/runsusie and attach the LD matrix.
GWAS_df %>%
as.list -> GWAS
GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix
############################
# 3) Format eQTL GWAS
############################
# Same treatment for the eQTL dataset (effect allele = AssessedAllele).
eqtlGWAS_df <- eqtlGWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=AssessedAllele, -(beta), beta)) # Check allele alignment
#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
paste0(signal," ",tissue, " ", gene, " GTExv8: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}
eqtlGWAS <- eqtlGWAS_df %>% as.list
eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N
############################
# Check datasets are ok
############################
# check_dataset() returns NULL when a dataset passes coloc's sanity checks;
# log one "ok"/"warning" line per cred_set-tissue-gene so failures can be
# traced later without stopping the pipeline.
eqtlGWAS_check <- check_dataset(eqtlGWAS, req="LD")
GWAS_check <- check_dataset(GWAS, req="LD")
status <- if (is.null(GWAS_check) && is.null(eqtlGWAS_check)) "ok" else "warning"
cat(paste0(cred_set, "_", tissue, "_", gene, ": ", status, "\n"),
    file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
############################
## Run SuSie
############################
# Fine-map each dataset with SuSiE (pruning at r2 > 0.2; coloc's own LD-matrix
# check is skipped, check_dataset was run above) and colocalise every pair of
# SuSiE signals.
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE, verbose=TRUE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)
saveRDS(susie_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_", tissue, "_", gene, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_", tissue, "_", gene, "_susie_summary.tsv"))
src/coloc/004_concat_coloc_results.R
#!/usr/bin/env Rscript
#Rationale: Collate results together and extract significant Variant/Locus-Tissue-Gene colocalisation, from GTExV8 and eQTLGen
library(plyr)
library(tidyverse)
library(purrr)
library(xlsx)
###############
### GTEx v8 ###
###############
print("GTExV8 eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/gtex/")
file_paths_gtex = list.files(pattern="_all_coloc.rds")
# Manipulate file names to get various bits of info; files are named
# <pheno>_<chr>_<pos>_<a1>_<a2>_<tissue tokens...>_<gene>_all_coloc.rds
file_paths_gtex %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1", "t2", "t3", "t4", "t5", "t6"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
# Extract the tissue part of a split GTEx file name.
# df: character vector of "_"-separated tokens from strsplit().
# Returns the tissue tokens re-joined with "_" and lower-cased.
# FIX: the original passed `sep = "_"` to paste0(), but paste0() has no `sep`
# argument -- the "_" was silently absorbed into `...`, pasted onto every
# token, then the trailing "_" stripped with substr(). paste(collapse = "_")
# does the intended thing directly and is behaviorally identical.
# NOTE(review): tokens are taken from index 7, so token 6 is skipped --
# confirm this matches the file naming convention.
tissue_fct = function(df) {
  tolower(paste(df[7:(length(df) - 1)], collapse = "_"))
}
tissue = sapply(tmp, tissue_fct)
# Add gene and tissue to above dataframe, collapse the variant fields into a
# single chr_pos_a1_a2 snp ID, and keep only the columns needed downstream.
file_split %>%
mutate(gene = gene,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, gene, tissue) -> file_split
# Read all .rds files
data_list = lapply(file_paths_gtex, readRDS)
# Function to extract coloc information from results
# Pull the summary table out of a coloc result object.
summary_fct = function(res) {
  res$summary
}
# Collapse the list of coloc summaries into one dataframe, attach the
# file-derived metadata, and flag colocalised pairs (PP.H4 >= 0.9).
data_summary = ldply(lapply(data_list, summary_fct)) %>%
as_tibble %>%
bind_cols(file_split) %>%
select(-file) %>%
mutate(coloc = PP.H4.abf >= 0.9) %>% # if_else(cond, TRUE, FALSE) is just cond
arrange(desc(PP.H4.abf))
# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene)
# Save all results to file
print("GTExV8 eQTL coloc.susie results")
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_GTEx.tsv")
# .rds files containing coloc.susie results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/gtex/")
file_paths_gtex = list.files(pattern="_all_susie.rds")
# Manipulate file names to get various bits of info (same naming scheme as
# the coloc files above, with the _all_susie.rds suffix instead).
file_paths_gtex %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1", "t2", "t3", "t4", "t5", "t6"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
# Extract the tissue part of a split GTEx file name (see the identical helper
# above).
# FIX: `sep = "_"` is not a paste0() argument; it was silently pasted onto
# every token and the trailing "_" stripped afterwards. paste(collapse = "_")
# is the intended, behaviorally identical operation.
# NOTE(review): tokens are taken from index 7, so token 6 is skipped --
# confirm this matches the file naming convention.
tissue_fct = function(df) {
  tolower(paste(df[7:(length(df) - 1)], collapse = "_"))
}
tissue = sapply(tmp, tissue_fct)
# Add gene and tissue to above dataframe and collapse the variant fields into
# a single snp ID.
file_split %>%
mutate(gene = gene,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, gene, tissue) -> file_split
# Read all .rds files
data_list = lapply(file_paths_gtex, readRDS)
# Function to extract coloc.susie information from results
# Pull the summary table out of a coloc.susie result object.
summary_fct = function(res) {
  res$summary
}
# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
##(coloc.susie returns a NULL summary when no credible sets are found, so tag
##each element with its snp and file index first, then drop the NULLs)
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL" # find all NULLs to omit
ID <- file_split$snp
# One index per _all_susie.rds file. seq_along() replaces the hard-coded
# 1:937, which silently broke whenever the number of result files changed
# (the sibling UBCLung script already computes this from the list length).
index_row <- seq_along(tmp_1)
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]
file_split$n_index <- index_row
data_summary_colocsusie = ldply(tmp_4) %>%
as_tibble %>%
left_join(file_split,by=c("snp","n_index")) %>%
select(-file) %>%
mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
arrange(desc(PP.H4.abf))
# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene)
# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_GTEx.tsv")
#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
# Union of genes flagged by coloc and coloc.susie, appended as a new sheet to
# the shared var2genes_raw workbook.
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene))
gene_colocsusie <- as.data.frame(data_summary_colocsusie %>% filter(coloc_susie) %>% select(gene))
gene_gtex <- rbind(gene_coloc,gene_colocsusie) %>% unique()
write.xlsx(gene_gtex,"/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx", sheetName = "GTExV8_eQTL_genes", row.names=FALSE, col.names=FALSE, append=TRUE)
###############
### eqtlGen ###
###############
print("eqtlGen eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/")
file_paths_eqtlgen = list.files(pattern="_all_coloc.rds")
# Manipulate file names to get various bits of info; eQTLGen files are named
# <pheno>_<chr>_<pos>_<a1>_<a2>_<tissue>_<gene>_all_coloc.rds
file_paths_eqtlgen %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
# eQTLGen file names carry a single tissue token, always the sixth field.
tissue_fct = function(parts) {
  paste0(parts[6])
}
tissue = sapply(tmp, tissue_fct)
# Add gene and tissue to above dataframe and collapse the variant fields into
# a single snp ID.
file_split %>%
mutate(gene = gene,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, gene, tissue) -> file_split
# Read all .rds files
data_list = lapply(file_paths_eqtlgen, readRDS)
# Function to extract coloc information from results
summary_fct = function(df) {
return(df$summary)
}
# Collapse list of coloc results into single dataframe and add a few new columns
data_summary = ldply(lapply(data_list, summary_fct)) %>%
as_tibble %>%
bind_cols(file_split) %>%
select(-file) %>%
mutate(coloc = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>% # colocalised if PP.H4 >= 0.9
arrange(desc(PP.H4.abf))
# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene)
# Save all results to file
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_eqtlgen.tsv")
# .rds files containing coloc.susie results with eQTLs
file_paths_eqtlgen= list.files(pattern="_all_susie.rds")
# Manipulate file names to get various bits of info (same scheme as the coloc
# files above, with the _all_susie.rds suffix instead).
file_paths_eqtlgen %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
# Tissue is the single sixth token for eQTLGen files.
tissue_fct = function(df) {
x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)
# Add gene and tissue to above dataframe
file_split %>%
mutate(gene = gene,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, gene, tissue) -> file_split
# Read all .rds files
data_list = lapply(file_paths_eqtlgen, readRDS)
# Function to extract coloc.susie information from results
summary_fct = function(df) {
return(df$summary)
}
# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
##(coloc.susie returns a NULL summary when no credible sets are found)
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL" # find all NULLs to omit
ID <- file_split$snp
# One index per _all_susie.rds file. seq_along() replaces the hard-coded
# 1:169, which silently broke whenever the number of result files changed.
index_row <- seq_along(tmp_1)
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]
file_split$n_index <- index_row
data_summary_colocsusie = ldply(tmp_4) %>%
as_tibble %>%
left_join(file_split,by=c("snp","n_index")) %>%
select(-file) %>%
mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
arrange(desc(PP.H4.abf))
# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene)
# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_eqtlgen.tsv")
#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
#coloc.susie does not have any significant colocalisation:
# so only the standard-coloc genes are appended to the workbook.
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene))
write.xlsx(gene_coloc,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx", sheetName = "eqtlGen_eQTL_genes", row.names=FALSE, col.names=FALSE, append=TRUE)
src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh
#!/bin/bash
#SBATCH --job-name=lookup_lungeqtl
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=30gb
#SBATCH --account=gen1
#SBATCH --export=NONE
# Run the UBC Lung eQTL lookup for one credible set; CREDSET is injected via
# sbatch --export (see the submission loop below).
module load R
Rscript src/coloc_UBClung/000_run_lookup_lung_eQTL.r "UBCLung" $CREDSET
#To be submitted as:
#for c in ${!cs[*]}; do
#
# sbatch --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh
#
#done
src/coloc_UBClung/000_run_lookup_lung_eQTL.r
#!/usr/bin/env Rscript
#Rationale: Lookup of overlapping region between severe asthma GWAS and UBC Lung eQTL significant genes in order to
#perform colocalisation only in regions that have significant associations in both studies.
sink(stderr())
suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
# Args: 1 = tissue label ("UBCLung"), 2 = credible set ID
# "<pheno>_<chr>_<pos>_<a1>_<a2>".
args = commandArgs(trailingOnly = TRUE)
tissue = args[1] #"UBCLung"
cred_set = args[2]
tmp = unlist(strsplit(cred_set, split="_"))
chr_sentinel = as.numeric(tmp[2])
location_sentinel = as.numeric(tmp[3])
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"
##alleles are placed in alphabetical order ! (no dependency on effect allele, reference/alternative)
eGene <- fread("/data/gen1/LF_HRC_transethnic/eQTL/lung_eQTL/eGene_list.txt")
# SNP IDs look like "<chr>_<bp>_<A1>_<A2>"; strip the other fields out.
eGene$chr <- gsub("_[0-9]*_[A-Z]*_[A-Z]*","",eGene$SNP)
eGene$bp <- gsub("^[0-9]*_","",eGene$SNP)
# BUG FIX: bp must be numeric. It was left as character, so the numeric
# window boundaries below were coerced to character and compared
# lexicographically (e.g. "9000000" > "10000000"), silently corrupting the
# +/-1Mb window filter.
eGene$bp <- as.numeric(gsub("_[A-Z]*_[A-Z]*","",eGene$bp))
# Keep eQTL signals within +/-1Mb of the sentinel variant.
eGENE_lookup <- eGene %>% filter(chr == chr_sentinel,
bp >= location_sentinel-1000000,
bp <= location_sentinel+1000000)
eGENE_lookup <- eGENE_lookup %>% mutate(sentinel=cred_set)
probeset <- eGENE_lookup %>% distinct(ProbeSet)
#colnames: SNP ProbeSet p_value BF TESTS chr bp sentinel
write_delim(x=eGENE_lookup, file=paste0(tmp_path, cred_set,"_", tissue, "_pairs.txt.gz"))
write_delim(x=probeset, file=paste0(tmp_path, cred_set,"_", tissue, "_probesets.txt"), col_names=FALSE)
src/coloc_UBClung/002_prepare_LDinput_ubclung.R
#!/usr/bin/env Rscript
#Rationale: combine names in a document to create LD .raw file for UBCLung
library(tidyverse)
args = commandArgs(trailingOnly = TRUE)
tissue = args[1]
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"
# One row per credible set: split the "<pheno>_<chr>_<pos>_<a1>_<a2>" IDs
# into their components.
df <- data.frame(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
"SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
"SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
"SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
"SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
as_tibble %>%
separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_")
df$ancestry <- "EUR"
# +/-500kb window around each sentinel, used downstream for LD extraction.
df <- df %>%
unite("snp", chr:a2, remove=FALSE) %>%
mutate(eQTL = tissue,
start = as.numeric(pos) - 5e5,
end = as.numeric(pos) + 5e5) %>%
select(pheno, ancestry, snp, chr, pos, eQTL, start, end)
# FIX: spell out TRUE/FALSE instead of the reassignable T/F shorthands.
write_tsv(x=df, file=paste0(tmp_path,tissue,"_Pairs_lookup.txt"),col_names=FALSE, append=TRUE)
src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
#!/bin/bash
#SBATCH --job-name=coloc_susie_UBCLung
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE
# Array job: task i runs coloc/coloc.susie for the i-th probe set of CREDSET
# (CREDSET supplied via sbatch --export).
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"
i=$SLURM_ARRAY_TASK_ID
# Pick the i-th probe set ID (one per line) for this credible set.
read PROBESET < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_UBCLung_probesets.txt)
module load R
Rscript src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r "UBCLung" $CREDSET $PROBESET
src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r
#!/usr/bin/env Rscript
#Rationale: Run coloc and coloc.susie in UBCLung eQTL
suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))
library(data.table)
# Args: 1 = tissue label ("UBCLung"), 2 = credible set ID
# "<pheno>_<chr>_<pos>_<a1>_<a2>", 3 = eQTL probe set ID.
args = commandArgs(trailingOnly = TRUE)
tissue = args[1]
cred_set = args[2]
probe = args[3]
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"
# Split the credible set ID into the phenotype prefix...
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
select(pheno) %>%
pull -> pheno
# ...and the chr_pos_a1_a2 signal name.
cred_set %>%
as_tibble %>%
separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
unite("signal", chr:a2) %>%
pull -> signal
##alleles are placed in alphabetical order ! (no dependency on effect allele, reference/alternative)
############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set, "_GWASpairs.txt.gz")) %>%
rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
select(-snpid) %>%
arrange(chr, pos) %>%
drop_na(beta) %>%
mutate(varbeta = SE^2) %>%
rename(position=pos)
GWAS$allele1.gwas <- GWAS$a1
# snp IDs use alphabetically ordered alleles so they match the eQTL/LD IDs.
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
distinct(snp, .keep_all=TRUE)
GWAS$N <- as.numeric(38405) # GWAS sample size used for all variants
# eQTL region stats for this credible set; keep only the requested probe set.
eQTL <- fread(paste0(tmp_path,"eQTL_region_stat/",cred_set,"_eQTL_region.txt",sep=""),header=T)
setnames(eQTL,"#Probe","ProbeSet")
eQTL <- eQTL %>% filter(ProbeSet==probe)
# Same alphabetical-allele snp ID convention as the GWAS above.
eQTL <- mutate(eQTL,snp=paste(CHR,BP,pmin(toupper(Allele1),toupper(Allele2)),pmax(toupper(Allele1),toupper(Allele2)),sep="_"))
eqtlGWAS <- eQTL %>% filter(snp %in% GWAS$snp)
setnames(eqtlGWAS,"Freq1","freq")
setnames(eqtlGWAS,"Effect","b")
setnames(eqtlGWAS,"StdErr","se")
setnames(eqtlGWAS,"P","pval")
eqtlGWAS$freq <- as.numeric(eqtlGWAS$freq)
eqtlGWAS$b <- as.numeric(eqtlGWAS$b)
eqtlGWAS$se <- as.numeric(eqtlGWAS$se)
eqtlGWAS$z <- eqtlGWAS$b/eqtlGWAS$se
# Effective sample size estimated from frequency and SE:
# N ~ 1/(2*f*(1-f)*se^2) - z^2, averaged into a single integer below.
eqtlGWAS$N <- 1/(2*eqtlGWAS$freq*(1-eqtlGWAS$freq)*eqtlGWAS$se*eqtlGWAS$se)-eqtlGWAS$z*eqtlGWAS$z
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N
# Fold frequencies to MAF and flip betas so they refer to the minor allele.
eqtlGWAS <- mutate(eqtlGWAS,MAF=ifelse(freq>0.5,1-freq,freq))
eqtlGWAS <- mutate(eqtlGWAS,beta=ifelse(freq>0.5,-1*b,b))
eqtlGWAS <- eqtlGWAS %>% mutate(varbeta = se^2)
eqtlGWAS$allele1.eqtl <- toupper(eqtlGWAS$Allele1)
eqtlGWAS <- select(eqtlGWAS,"snp","MAF","beta","se","varbeta","pval","allele1.eqtl","N")
# Keep only the variants shared by both datasets.
GWAS <- GWAS %>% filter(snp %in% eqtlGWAS$snp)
############################
# * if also want coloc results
############################
# Standard single-causal-variant colocalisation (GWAS case-control vs
# quantitative eQTL).
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta,
N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp),
dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta,
N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))
saveRDS(coloc_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_ubclung_", probe, "_all_coloc.rds"))
########################
############################
# 1) Format LD matrix
############################
# PLINK .raw dosage export: drop the metadata columns (FID..PHENOTYPE),
# keeping one dosage column per variant.
ld = data.table::fread(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
) %>%
select(!c(FID:PHENOTYPE))
# Rebuild variant IDs as chr_pos_minAllele_maxAllele.
ld %>%
names %>%
as_tibble %>%
separate(value,c("c", "p", "a1", "a2")) %>%
mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
select(snp) %>%
pull -> snps
# Rows = variants; drop monomorphic variants and keep the GWAS/eQTL overlap.
ld = t(ld) %>%
as_tibble %>%
mutate(var=apply(., 1, var, na.rm=TRUE),
snp=snps) %>%
filter(var!=0) %>%
filter(snp %in% GWAS$snp) %>%
filter(snp %in% eqtlGWAS$snp)
# Last two columns are var and snp; the first N columns hold the dosages.
N = ncol(ld)-2
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>%
select(snp) %>%
pull # Add names to facilitate filtering NAs (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
LDmatrix[is.na(LDmatrix)] <- 0
############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
) %>%
select(!morgans) # Read bim file to check same set of SNPs and allele alignment
# Restrict to variants in the LD matrix; flip betas where the bim allele1
# differs from the GWAS effect allele.
GWAS_df <- GWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment
#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", probe, " UBClung: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}
# Convert to the list layout expected by coloc/runsusie and attach the LD matrix.
GWAS_df %>%
as.list -> GWAS
GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix
############################
# 3) Format eQTL GWAS
############################
# Same treatment for the eQTL dataset (effect allele = allele1.eqtl).
eqtlGWAS_df <- eqtlGWAS %>%
filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
inner_join(bim) %>%
mutate(beta=ifelse(allele1!=allele1.eqtl, -(beta), beta)) # Check allele alignment
#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
stop(paste0(signal," ",tissue, " ", probe, " UBClung: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
paste0(signal," ",tissue, " ", probe, " UBClung: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}
eqtlGWAS <- eqtlGWAS_df %>% as.list
eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N
############################
# Check datasets are ok
############################
# check_dataset() returns NULL on success; record one "ok"/"warning" line per
# cred_set-tissue-probe so problem datasets can be traced afterwards.
eqtlGWAS_check <- check_dataset(eqtlGWAS, req="LD")
GWAS_check <- check_dataset(GWAS, req="LD")
status <- if (is.null(GWAS_check) && is.null(eqtlGWAS_check)) "ok" else "warning"
cat(paste0(cred_set, "_", tissue, "_", probe, ": ", status, "\n"),
    file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
############################
## Run SuSie
############################
# Fine-map both datasets with SuSiE (pruning at r2 > 0.2; coloc's own
# LD-matrix check is skipped, check_dataset was run above) and colocalise
# every pair of SuSiE signals.
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE, verbose=TRUE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)
saveRDS(susie_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_", tissue, "_", probe, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_", tissue, "_", probe, "_susie_summary.tsv"))
src/coloc_UBClung/004_concat_coloc_susie_ubclung.R
#!/usr/bin/env Rscript
#Rationale: Collate results together and extract significant Variant/Locus-Tissue-Gene colocalisation, from UBCLung
library(plyr)
library(tidyverse)
library(purrr)
library(data.table)
#find probset that colocalised and map to gene using /data/gen1/reference/lung_eQTL/tabMerged_anno.txt
###############
### UBCLung ###
###############
print("UBCLung eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/")
file_paths_ubclung = list.files(pattern="_all_coloc.rds")
# Manipulate file names to get various bits of info; UBCLung files are named
# <pheno>_<chr>_<pos>_<a1>_<a2>_<tissue>_<probe part1>_<probe part2>_all_coloc.rds
file_paths_ubclung %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
# (probe set IDs contain one "_", so take the last two tokens).
tmp = strsplit(file_split$file, "_")
probe_1 = sapply(tmp, function(innerList) innerList[[length(innerList) - 1]])
probe_2 = sapply(tmp, tail, 1)
# Tissue is the single sixth token.
tissue_fct = function(df) {
x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)
# Add probe halves and tissue to above dataframe, collapsing the variant
# fields into a single snp ID.
file_split %>%
mutate(probe_1 = probe_1,
probe_2 = probe_2,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, probe_1, probe_2, tissue) -> file_split
# Reassemble the full probe set ID for joining with the annotation table.
file_split$ProbeSet <- paste0(file_split$probe_1,"_",file_split$probe_2)
# Read all .rds files
data_list = lapply(file_paths_ubclung, readRDS)
# Function to extract coloc information from results
summary_fct = function(df) {
return(df$summary)
}
# Collapse list of coloc results into single dataframe and add a few new columns
data_summary = ldply(lapply(data_list, summary_fct)) %>%
as_tibble %>%
bind_cols(file_split) %>%
select(-file) %>%
mutate(coloc = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
arrange(desc(PP.H4.abf))
#map probset to gene:
gene_probe <- fread("/data/gen1/reference/lung_eQTL/tabMerged_anno.txt",header=T)
gene_probe$ProbeSet <- gsub("_at","",gene_probe$ProbeSet)
setnames(gene_probe,"gene","gene_id")
data_summary <- left_join(data_summary, gene_probe,by="ProbeSet")
# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene_id)
# Save all results to file
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_ubclung.tsv")
# .rds files containing coloc.susie results with eQTLs
file_paths_ubclung = list.files(pattern="_all_susie.rds")
# Manipulate file names to get various bits of info (same scheme as the coloc
# files above, with the _all_susie.rds suffix instead).
file_paths_ubclung %>%
as_tibble %>%
mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
separate(col=file,
into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
sep="_",
remove=FALSE) -> file_split
# Clunky way to get out gene and tissue names from file names
# (probe set IDs contain one "_", so take the last two tokens).
tmp = strsplit(file_split$file, "_")
probe_1 = sapply(tmp, function(innerList) innerList[[length(innerList) - 1]])
probe_2 = sapply(tmp, tail, 1)
# Tissue is the single sixth token.
tissue_fct = function(df) {
x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)
# Add probe halves and tissue to above dataframe
file_split %>%
mutate(probe_1 = probe_1,
probe_2 = probe_2,
tissue = tissue) %>%
unite("snp", chr:a2) %>%
select(file=value, pheno, snp, probe_1, probe_2, tissue) -> file_split
# Reassemble the full probe set ID for joining with the annotation table.
file_split$ProbeSet <- paste0(file_split$probe_1,"_",file_split$probe_2)
# Read all .rds files
data_list = lapply(file_paths_ubclung, readRDS)
# Function to extract coloc.susie information from results
summary_fct = function(df) {
return(df$summary)
}
# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
##(coloc.susie returns a NULL summary when no credible sets are found)
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL" # find all NULLs to omit
ID <- file_split$snp
# FIX: seq_along() is safe when tmp_1 is empty, whereas 1:length(tmp_1)
# would produce c(1, 0).
index_row <- seq_along(tmp_1) # one index per all_susie.rds file
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]
file_split$n_index <- index_row
data_summary_colocsusie = ldply(tmp_4) %>%
as_tibble %>%
left_join(file_split,by=c("snp","n_index")) %>%
select(-file) %>%
mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
arrange(desc(PP.H4.abf))
#map probset to gene:
data_summary_colocsusie <- left_join(data_summary_colocsusie, gene_probe,by="ProbeSet")
# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene_id)
# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_ubclung.tsv")
#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
#coloc.susie does not have any significant colocalisation:
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene_id))
gene_coloc_susie <- as.data.frame(data_summary_colocsusie %>% filter(coloc_susie) %>% select(gene_id))
print(gene_coloc)
#NO gene coloc_susie
# Plain-text output here (unlike the xlsx sheets used for GTEx/eQTLGen);
# genes are added to var2genes_raw.xlsx manually, see the note below.
write.table(gene_coloc,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/UBCLungeqtl_var2genes_raw.txt", row.names=FALSE, col.names=FALSE, quote=F)
#then add the genes in the /Var_to_Gene/input/var2genes_raw.xlsx file
src/merge_genes_eqtl_coloc.R
#!/usr/bin/env Rscript
#Rationale: merge genes from eQTL colocalisation analysis
library(tidyverse)
library(data.table)
library(readxl)
genes_raw <- "input/var2genes_raw.xlsx"
# One sheet per eQTL source; the GTEx sheet also carries Ensembl IDs, which
# are dropped before merging.
gtex <- read_excel(genes_raw, sheet = "GTExV8_eQTL_genes_symbol",col_names = c("ensembl","gene"))
eqtlgen <- read_excel(genes_raw, sheet = "eqtlGen_eQTL_genes",col_names = "gene")
ubclung <- read_excel(genes_raw, sheet = "UBClung_eQTL_genes",col_names = "gene")
gtex$ensembl <- NULL
# De-duplicated union of gene symbols across the three sources, labelled with
# the evidence type for the final gene-prioritisation table.
eqtl_gene <- rbind(gtex,eqtlgen,ubclung) %>% unique()
eqtl_gene$evidence <- as.factor("eQTL")
write_tsv(x=eqtl_gene, file="input/eqtl_gene_merge")
Quantitative trait locus (QTL) analysis allows us to measure a trait,
such as RNA expression of genes or protein levels, in a specific tissue
and to test genes in that tissue for association with genetic variants.
In other words, the expression trait is used as a quantitative trait for
a genome-wide analysis.
Looking at QTL and GWAS summary
statistics, we can use statistical methods — colocalisation — to assess
whether both studies reveal a statistically significant association at
the same locus for the same variant. If so, we can then declare that the
variant we found in the GWAS shows evidence of association with a gene
in a specific tissue.
Different types of QTL data are available, based
on the quantitative trait we study as well as at the type of
variant-gene relationship we want to look at. As an example, expression
QTL looks at mRNA expression; in addition, if the analysis is only for
variants near the gene’s transcription start site (usually +/-1Mb), this
QTL is called ‘cis’-eQTL; if the analysis is for variants further away
that can have an effect on gene expression via the 3D conformation on
DNA, they are called ‘trans’-eQTL. If, instead, we want to investigate
the role of the variant on the splicing of the mRNA, we call it ‘s’QTL
(cis or trans-sQTL). When analysing protein levels, we call the
measure a 'p'QTL (and, for methylation levels, an 'm'QTL).
##
eQTL colocalisation I used cis-QTL data from three datasets: GTExV8,
eQTLGen blood, and UBC lung eQTL. I run the analysis if the cis-eQTL
showed significant association within the credible set regions, and I
included shared variants present in both eQTL and the GWAS summary
statistic within +/-500Kb from the associated variant. The
colocalisation was performed using coloc (https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1004383)
or coloc.susie (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=34587156)
methods.
Here, a little description of each cis-eQTL data and the
used parameters.
GTExV8 (https://www.gtexportal.org/)
GTExV8 is a collection
of eQTL data for significant variant-gene association in 49 tissues with
at least 70 samples (https://www.gtexportal.org/home/methods). I run
co-localisation for the following tissues: ’Artery_Aorta’,
‘Artery_Coronary’, ‘Artery_Tibial’, ‘Colon_Sigmoid’, ‘Colon_Transverse’,
‘Esophagus_Gastroesophageal_Junction’, ‘Esophagus_Muscularis’, ‘Lung’,
‘Skin_Not_Sun_Exposed_Suprapubic’,
‘Skin_Sun_Exposed_Lower_leg’,‘Small_Intestine_Terminal_Ileum’,
‘Stomach’.
I used filtered GTExV8 data including variant-gene pairs
for each tissue with both GWAS and eQTL pvalues <= 5x10-6.
eQTLgen blood eQTLs (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=34475573)
eQTLGen blood eQTLs is a source of 16,989 eQTL genes from 37
datasets and a total of 31,684 individuals, counting 11M SNPs with MAF
>=1% and within 1Mb of the analysed genes (https://www.eqtlgen.org/phase1.html). I used filtered
eQTLGen blood variant-gene pairs with both GWAS and eQTL p-values <=
5x10-6.
UBC lung eQTL (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=23209423)
UBC Lung eQTL is a collection of lung specimens from 1,111
individuals of European ancestry including 17,178 cis- and 593 trans-
lung eQTLs.
For UBC Lung, raw data were adjusted to find
significant p-value: eigenMT to correct for LD structure p-value, and
then benjamini hochberg correction (FDR) to take into account of
multiple testing.
EigenMT and FDR p-value correction were performed
by Jing Chen as for the multi-ancestry lung function paper; detailed
information on the analysis can be found in the main manuscript and
related supplementary information (https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-023-01314-0/MediaObjects/41588_2023_1314_MOESM1_ESM.pdf).
Jing provided me with the list of significant gene-variant
associations for UBC Lung eQTL after p-value correction. She also
provided me backbone scripts to run colocalisation with this eQTL.
Code
src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
#!/bin/bash
#SBATCH --job-name=ukbpqtol_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=24gb
#SBATCH --account=gen1
#SBATCH --export=NONE
# Array task index selects one locus from the list below.
i=$((SLURM_ARRAY_TASK_ID))
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
# Bonferroni-corrected p-value threshold (0.05 over 1463 tests).
THRESH=`bc <<< "scale=10; 0.05/1463"`
locus=('9_rs992969_5709697_6709697' '8_rs7824394_80792599_81792599' '6_rs148639908_90463614_91463614' '5_rs2188962_rs848_131270805_132496500' '5_rs1837253_rs3806932_109901872_110905675' '3_rs35570272_32547662_33547662' '2_rs6761047_242192858_243192858' '2_rs12470864_102426362_103426362' '17_17:38073838_CCG_C_37573838_38573838' '16_rs3024619_26864806_27864806' '15_rs17293632_66942596_67942596' '15_rs11071559_60569988_61569988' '12_rs73107993_47695873_48695873' '12_rs705705_55935504_56935504' '11_rs10160518_75796671_76796671' '10_rs201499805_rs1444789_8542744_9564361' '6_rs9271365_32086794_33086794')
CREDSET="${locus[i]}"
mkdir ${tmp_path}/ukb_pqtl/${CREDSET}
# For every credible set variant (skip the header line), look up UKB pQTLs at
# that exact b38 position; one output file per variant.
# (column 3 = chromosome, column 11 = b38 position in the SA_*_b38 files)
awk 'NR > 1 {print$0}' ${tmp_path}/ukb_pqtl/SA_${CREDSET}_b38 | while IFS='' read -r LINE || [ -n "${LINE}" ]
do
chr=`echo ${LINE} | awk '{print $3}'`
pos=`echo ${LINE} | awk '{print $11}'`
/data/gen1/UKBiobank/olink/pQTL/pqtl_lookup.sh -r ${chr}:${pos}-${pos} -p ${THRESH} \
> ${tmp_path}/ukb_pqtl/${CREDSET}/SA_${chr}_${pos}_ukb_protein_lookup.txt
done
##run job as:
#sbatch --array 0-16 src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
src/pQTL_coloc/000_preprocess_cs_b38.R
#!/usr/bin/env Rscript
# Rationale: Create a separate credible-set file for each genomic locus.
# Usage: Rscript 000_preprocess_cs_b38.R <credset_file> <output_prefix> <b38_coords_file>
args <- commandArgs(trailingOnly = TRUE)
cred_set <- args[1]
tmp_path <- args[2]
cred_set_b38 <- args[3]
library(tidyverse)
library(data.table)
# Credible sets (b37) and their lifted-over b38 coordinates, row-aligned.
cs <- fread(cred_set)
b38 <- fread(cred_set_b38)
colnames(b38) <- c("chr_b38","pos_b38","pos1_b38")
cs_b38 <- cbind(cs, b38)
pheno <- unique(cs_b38$locus)
#split credset df into separate file for each locus:
##NB: allele2 is the effect allele, I change them because in /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat a1 is the effect allele, a2 is the non effect allele
# Write one file per locus: <tmp_path>SA_<locus>_b38
for (loc in pheno) {
  loc_name <- as.character(loc)
  per_locus <- cs_b38 %>% filter(locus == loc_name)
  fwrite(per_locus, paste0(tmp_path, "SA_", loc_name, "_b38"), quote = FALSE, sep = "\t")
}
src/pQTL_coloc/001_combine_pqtl.awk
#!/usr/bin/awk -f
#Rationale: combine all the UKB-pQTL with significant variant-pQTL association after look-up analysis in severe asthma GWAS
#credible set variants.
# Usage: 001_combine_pqtl.awk <locus1>_ukb_protein_lookup.txt <locus2>_... > combined
# Emits one table: the header of the first file prefixed with "LOCUS", then every
# data line of every input prefixed with its (suffix-stripped) file name.
BEGIN {
# tab-separated output columns
OFS = "\t"
}
# Very first input line overall: print the header once.
NR == 1 {
print "LOCUS", $0
}
# Data lines (line 2+) of each file: use the file name, minus the common suffix,
# as the locus label. NOTE: sub() rewrites FILENAME in place; any directory
# prefix on the path is kept in the label.
FNR > 1 {
sub("_ukb_protein_lookup.txt", "", FILENAME)
print FILENAME, $0
}
src/pQTL_coloc/000_submit_lookup_scallop.sh
#!/bin/bash
#SBATCH --job-name=scallop_pqtl_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=01:00:00
#SBATCH --mem=24gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#Rationale: look-up in pQTL in SCALLOP pQTL - script from Chiara
PQTL_DATA="/data/gen1/reference/SCALLOP"
SNP_LIST="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt"
PQTL_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/scallop_pqtl"
## List of variants with columns: ID, CHROM, POS
cat ${SNP_LIST} | awk '{print $3"_"$4"_"$5"_"$6,$3,$4}' > ${PQTL_PATH}/snps_list.txt
# Set up log file
touch ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
echo -ne "FILENAME\tPROTEIN\tN_LOOKUP\tN_NOMINAL\tN_SIG" > ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
echo >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
# Perform look-up using Nick's awk script, one protein file at a time
for filename in ${PQTL_DATA}/*.txt.gz
do
## Protein name = file name without path and .txt.gz suffix
## (fix: "$(unknown)" was a corrupted reference to ${filename})
name=$(basename "${filename}" .txt.gz)
## Perform look-up. 000_scallop_lookup.awk re-emits the pQTL header itself,
## so no separate header write is needed before this redirection.
/home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/000_scallop_lookup.awk \
${PQTL_PATH}/snps_list.txt <(zcat "${filename}") > ${PQTL_PATH}/${name}.txt
## Report look-up statistics in log file: total matched rows, nominally
## significant (p in column 8 < 0.05), and genome-wide significant (p < 5e-8)
N_total=`cat ${PQTL_PATH}/${name}.txt | tail -n +2 -q | wc -l`
N_nominal=`cat ${PQTL_PATH}/${name}.txt | awk '$8 < 0.05 {print $0}' | wc -l`
N_sig=`cat ${PQTL_PATH}/${name}.txt | awk '$8 < 5E-8 {print $0}' | wc -l`
echo -ne "${name}.txt\t${name}\t${N_total}\t${N_nominal}\t${N_sig}" >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
echo >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
done
src/pQTL_coloc/000_scallop_lookup.awk
#!/usr/bin/awk -f
# Usage: 000_scallop_lookup.awk snps_list.txt <(zcat protein.txt.gz)
# Filters pQTL results down to the chr:pos pairs listed in the proxies file.
# For each line of the first input file (the proxies) add a chr:pos key to the proxies array
# The array keys are unique so there won't be any duplicates even if you
# add the same chr, pos twice.
#
# NR = line number over all input files
# FNR = line number in current file
# Hence when NR == FNR you must be in the first input file
# "next" means go to the next line in the file without processing any subsequent rules
# NOTE(review): proxies[$2][$3] is a GNU awk "array of arrays"; this script needs
# gawk (POSIX awk would require proxies[$2,$3]) — confirm awk is gawk on the cluster.
NR == FNR {
proxies[$2][$3] = 1
next
}
# For the second input file (the pQTL results) output the header
# (processing will only get this far if the rule above didn't match)
FNR == 1 {
print
}
# For subsequent lines in the the second file
FNR > 1 {
# Store a copy of the original line to output
line = $0
# Substitute ":" to split chr, pos into fields for matching
gsub(":", "\t", $0)
}
# Print the original line if chr, pos is in our array of proxies
# ("line" is only set for FNR > 1; the header is not expected to match a proxy)
proxies[$1][$2] {
print line
}
src/pQTL_coloc/000_submit_lookup_decode.sh
#!/bin/bash
#SBATCH --job-name=decode_pqtl_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=72:00:00
#SBATCH --mem=70gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#Rationale: look-up in pQTL in deCODE pQTL - script from Chiara
ukbb38_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ukb_pqtl"
PQTL_DATA="/data/gen1/pQTL/Ferkingstad_2021"
PQTL_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/decode_pqtl"
#USE b38 POSITION:
##NOTE: DEPENDENCIES ON SCRIPT src/pQTL_coloc/000_preprocess_cs_b38.R TO CREATE B38 FILES:
## List of variants with columns: ID (chr_posb37_a1_a2), CHROM, POS (pos b38):
awk 'NR >1 {print $3"_"$4"_"$5"_"$6, "chr"$3, $11}' \
${ukbb38_path}/SA_*_b38 | grep -v "^chromosome" > ${PQTL_PATH}/snps_list.txt
# Set up log file
touch ${PQTL_PATH}/log_pQTL_decode_analysis
echo -ne "FILENAME\tPROTEIN\tN_LOOKUP\tN_NOMINAL\tN_SIG" > ${PQTL_PATH}/log_pQTL_decode_analysis
echo >> ${PQTL_PATH}/log_pQTL_decode_analysis
# Perform look-up using Nick's awk script, one protein file at a time
for filename in ${PQTL_DATA}/*.txt.gz
do
## Protein name = file name without path and .txt.gz suffix
## (fix: "$(unknown)" was a corrupted reference to ${filename})
name=$(basename "${filename}" .txt.gz)
## Perform look-up. 000_decode_lookup.awk re-emits the pQTL header itself,
## so no separate header write is needed before this redirection.
/home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/000_decode_lookup.awk \
${PQTL_PATH}/snps_list.txt <(zcat "${filename}") > ${PQTL_PATH}/${name}.txt
## Report look-up statistics in log file: total matched rows, nominally
## significant (p in column 10 < 0.05), and Bonferroni significant
## (p < 1.8e-9 = 0.05 / 27.2M tested variants, per Ferkingstad et al. 2021)
N_total=`cat ${PQTL_PATH}/${name}.txt | tail -n +2 -q | wc -l`
N_nominal=`cat ${PQTL_PATH}/${name}.txt | awk '$10 < 0.05 {print $0}' | wc -l`
N_sig=`cat ${PQTL_PATH}/${name}.txt | awk '$10 < 1.8E-9 {print $0}' | wc -l`
echo -ne "${name}.txt\t${name}\t${N_total}\t${N_nominal}\t${N_sig}" >> ${PQTL_PATH}/log_pQTL_decode_analysis
echo >> ${PQTL_PATH}/log_pQTL_decode_analysis
done
src/pQTL_coloc/000_decode_lookup.awk
#!/usr/bin/awk -f
# Usage: 000_decode_lookup.awk snps_list.txt <(zcat protein.txt.gz)
# Filters deCODE pQTL results down to the chr:pos pairs listed in the proxies file.
# For each line of the first input file (the proxies) add a chr:pos key to the proxies array
# The array keys are unique so there won't be any duplicates even if you
# add the same chr, pos twice.
#
# NR = line number over all input files
# FNR = line number in current file
# Hence when NR == FNR you must be in the first input file
# "next" means go to the next line in the file without processing any subsequent rules
# NOTE(review): proxies[$2][$3] is a GNU awk "array of arrays"; this script needs
# gawk (POSIX awk would require proxies[$2,$3]) — confirm awk is gawk on the cluster.
NR == FNR {
proxies[$2][$3] = 1
next
}
# For the second input file (the pQTL results) output the header
# (processing will only get this far if the rule above didn't match)
FNR == 1 {
print
}
# For subsequent lines in the the second file
FNR > 1 {
# Store a copy of the original line to output
line = $0
# Substitute ":" to split chr, pos into fields for matching
gsub(":", "\t", $0)
}
# Print the original line if chr, pos is in our array of proxies
# ("line" is only set for FNR > 1; the header is not expected to match a proxy)
proxies[$1][$2] {
print line
}
I decided to perform a look-up of the 615 credible set variants in
three pQTL sources: deCODE plasma, SCALLOP, and UK Biobank pQTL data.
The significance threshold used for each source is described below.
deCODE plasma pQTL (https://pubmed.ncbi.nlm.nih.gov/34857953/) deCODE is a
free collection of protein plasma pQTL data for 4,907 proteins and 27.2
million tested variants. They used Bonferroni corrected p-value for
multiple testing (0.05/27,200,000=1.8x10−9). They included variants with
a MAF > 0.01% and imputation score > 0.9. If the associated
variant was within 1Mb from the transcription start site of the gene,
the pQTL was defined as cis, if outside this threshold as trans.
SCALLOP pQTL (https://pubmed.ncbi.nlm.nih.gov/33067605/)
SCALLOP
(Systematic and Combined Analysis of Olink Proteins) pQTL combines
measures from 90 circulating cardiovascular proteins in about 30,000
individuals from 13 cohorts.
UK Biobank
pQTL (https://www.nature.com/articles/s41586-023-06592-6) The
Pharma Proteomics Project led to the analysis of 2,923 plasma proteins
in 54,219 UK Biobank participants overall. Within our group, the pQTL
association was performed again in 48,195 European samples (as defined
in Shrine et al. 2023) and for 1,463 proteins, including 44.8 million
MAC >= 5 variants.
Variant-protein associations were defined as
significant if pQTL p-value was less or equal to 1.8E-9 for deCODE, 5E-8
for SCALLOP, and 3.41E-5 (0.05/1,463) for UK Biobank pQTL.
Code src/PoPS/PoPS.sh
#!/usr/bin/env bash
#Rationale: run PoPS with severe asthma summary statistics
#Based on scripts and log files found in: /data/gen1/TSH/PoPS
#PoPS v0.1: https://github.com/FinucaneLab/pops/tree/add-license-1
#software stored in /data/gen1/LF_HRC_transethnic/PoPS/pops
module load R
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
#Download MAGMA and PoPS:
cd /home/n/nnp5/software
wget https://ctg.cncr.nl/software/MAGMA/prog/magma_v1.10.zip
# fix: unzip operates on the downloaded archive, not on the URL
unzip magma_v1.10.zip
cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene
#Step 0: Generate gene-level association statistics as by MAGMA to have genes.out and genes.raw files:
##bfile and gene-annot: already downloaded with respect to TSH/lung function analysis - use the same.
## pval: sumstats file from severe asthma GWAS as this:
#SNP p N
#rs147324274 0.9139 124358
#rs369318156 0.4323 124358
#Create the .sumstats file:
# Keep SNP id and p-value, set constant N=46086 (effective sample size); the sed
# call rewrites the mangled header row ("snpid pval 46086") to "SNP p N".
awk '{print $1, $9, $3=46086}' \
/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat | \
sed 's/snpid pval 46086/SNP p N/g' > ${tmp_path}/pops/SA.sumstat
# MAGMA gene analysis: snp-wise mean model on 1000G EUR reference panel
/home/n/nnp5/software/magma \
--bfile /data/gen1/LF_HRC_transethnic/PoPS/data/1000G.EUR \
--gene-annot /data/gen1/LF_HRC_transethnic/PoPS/data/magma_0kb.genes.annot \
--pval ${tmp_path}/pops/SA.sumstat ncol=N \
--gene-model snp-wise=mean \
--out ${tmp_path}/pops/SA
#Step 1: Select features
#the features are in: /data/gen1/LF_HRC_transethnic/PoPS/data/PoPS.features.txt.gz
sbatch src/PoPS/submit_pops.sh
src/PoPS/submit_pops.sh
#!/bin/bash
#SBATCH --job-name=SA_POPS
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=60:00:00
#SBATCH --mem=50gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#Rationale: PoPS feature selection and score calculation
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp"
popsdir="/data/gen1/LF_HRC_transethnic/PoPS"
module load python3
# Step 1: feature selection from the MAGMA gene results.
# fix: the original had a trailing backslash after --out, which spliced the
# following "for" loop into the python command line and broke the script.
python ${popsdir}/pops/pops.feature_selection.py \
--features ${popsdir}/data/PoPS.features.txt.gz \
--gene_results ${tmp_path}/pops/SA \
--out ${tmp_path}/pops/SA
# Step 2: compute PoP scores per chromosome (leave-one-chromosome-out model)
for i in {1..22}
do
python ${popsdir}/pops/pops.predict_scores.py \
--gene_loc ${popsdir}/data/gene_loc.txt \
--gene_results ${tmp_path}/pops/SA \
--features ${popsdir}/data/PoPS.features.txt.gz \
--selected_features ${tmp_path}/pops/SA.features \
--control_features ${popsdir}/data/control.features \
--chromosome ${i} \
--out ${tmp_path}/pops/SA
done
src/PoPS/PoPS_summary.R
#!/usr/bin/env Rscript
# Rationale: Look at PoPS results and create a summary of them.
## We currently suggest taking the highest scoring gene in each GWAS locus.
## You could further filter this set to only include genes in top 10% of PoP scores across all genes.
## Negative scores generally mean low evidence -- this is the predicted MAGMA z-score!! (https://github.com/FinucaneLab/pops/issues/4)
## From TSH paper: 'prioritized genes for all autosomal TSH sentinel variants within a 500kb (±250kb) window of
# the sentinel variant and reported the top prioritised genes in the region.
# If there was no gene prioritized within a 500kb window of the sentinel,
# we reported any top prioritized genes within a 1Mb window (Supplementary Data 19).'.
library(data.table)
library(tidyverse)

# Credible-set variants from fine-mapping; rename coordinate columns and build
# a sentinel id of the form chr_pos_a1_a2.
sig_list <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list, c("chromosome", "position"), c("chr", "pos"))
sig_list$sentinel <- paste0(sig_list$chr, "_", sig_list$pos, "_",
                            sig_list$allele1, "_", sig_list$allele2)
locus <- unique(sig_list$locus)

# PoPS reference files: gene coordinates and the raw feature matrix.
gene_loc <- fread("/data/gen1/LF_HRC_transethnic/PoPS/data/gene_loc.txt")
gene_features <- fread("/data/gen1/LF_HRC_transethnic/PoPS/data/PoPS.features.txt.gz")
##### sentinel 500 kb total window (sentinel +/- 250 kb) #####
# For each locus: take the highest-PIP (sentinel) variant, collect genes whose
# transcript start or end falls within the window, score each with PoPS and rank.
result <- data.frame(matrix(ncol = 16, nrow = 0))
for (i in locus) {
locus_sig_list <- sig_list %>% filter(locus == as.character(i))
# Sentinel = credible-set variant with the highest average PIP in the locus
sentinel <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(sentinel)
chr <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(chr)
pos <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(pos)
print(sentinel)
trait <- "SA"
if (chr != "X") {
chr <- as.integer(chr)
pos <- as.integer(pos)
pos1 <- pos - 250000 # window set
pos2 <- pos + 250000 # window set
if (pos1 < 0) {
pos1 <- 0
}
# Genes whose transcript start or end falls inside the window
genes <- gene_loc[gene_loc$CHR == chr, ]
genes <- genes[(genes$START >= pos1 & genes$START <= pos2) | (genes$END >= pos1 & genes$END <= pos2), ]
n_genes <- nrow(genes)
if (n_genes != 0) {
# Per-chromosome PoPS scores and coefficients: read once per locus
# (fix: previously re-read from disk for every gene in the inner loop)
PoPS <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/", trait, ".", chr, ".results"))
coefs <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/", trait, ".", chr, ".coefs"))
result1 <- data.frame(matrix(ncol = 16, nrow = 0))
for (j in seq_len(n_genes)) {
result2 <- data.frame(matrix(ncol = 16, nrow = 0))
gene <- genes$ENSGID[j]
result2[1, 1] <- gene
result2[1, 2] <- sentinel
score <- PoPS[PoPS$ENSGID == gene, ]$Score[1]
result2[1, 3] <- score
result2[1, 4] <- i
# Top 10 features driving the score, ranked by |feature value * coefficient|.
# copy() because ":=" below modifies a data.table by reference.
beta <- copy(coefs)
X_all <- gene_features[ENSGID == gene, ]
X <- select(X_all, beta$Feature)
X <- as.data.table(t(X))
beta$X <- X$V1
beta <- beta[, contribute := X * beta_hat]
beta <- beta[order(abs(contribute), decreasing = TRUE)]
for (k in 1:10) {
result2[1, 6 + k] <- beta$Feature[k]
}
# fix: was "<<-"; plain assignment is correct at script top level
result1 <- rbind(result1, result2)
}
result1 <- as.data.table(result1)
result1 <- result1[order(X3, decreasing = TRUE)]
# Highest-scoring gene in the window is flagged as prioritised (X6), X5 = rank
result1$X6 <- FALSE
result1$X6[1] <- TRUE
result1$X5 <- seq_len(n_genes)
result <- rbind(result, result1)
}
}
}
names(result) <- c("ENSGID","sentinel","PoPS_score","signal_id","gene_rank","prioritized","Feature1","Feature2","Feature3","Feature4","Feature5","Feature6","Feature7","Feature8","Feature9","Feature10")
result <- result[order(signal_id,gene_rank)]
write.table(result,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window.txt", row.names=F, quote=F, sep="\t")
#save results for only prioritised genes:
write.table(result[gene_rank==1,],"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_250kb_window.txt", row.names=F, quote=F, sep="\t")
#save prioritised genes only:
genes_250 <- unique(result %>% filter(gene_rank == 1) %>% select(ENSGID))
write.table(genes_250,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/pops_var2genes_raw_250kbwindow.txt", row.names=F, quote=F, sep="\t", col.names=F)
#two locus not present when looking at 500kb total window:
#10_rs201499805_rs1444789_8542744_9564361 and 15_rs11071559_60569988_61569988
##### sentinel 1 mb total window (sentinel +/- 500 kb) #####
# Same procedure as the 250kb pass above, with a wider window for loci that had
# no prioritised gene in the narrower one.
result <- data.frame(matrix(ncol = 16, nrow = 0))
for (i in locus) {
locus_sig_list <- sig_list %>% filter(locus == as.character(i))
# Sentinel = credible-set variant with the highest average PIP in the locus
sentinel <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(sentinel)
chr <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(chr)
pos <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(pos)
print(sentinel)
trait <- "SA"
if (chr != "X") {
chr <- as.integer(chr)
pos <- as.integer(pos)
pos1 <- pos - 500000 # window set
pos2 <- pos + 500000 # window set
if (pos1 < 0) {
pos1 <- 0
}
# Genes whose transcript start or end falls inside the window
genes <- gene_loc[gene_loc$CHR == chr, ]
genes <- genes[(genes$START >= pos1 & genes$START <= pos2) | (genes$END >= pos1 & genes$END <= pos2), ]
n_genes <- nrow(genes)
if (n_genes != 0) {
# Per-chromosome PoPS scores and coefficients: read once per locus
# (fix: previously re-read from disk for every gene in the inner loop)
PoPS <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/", trait, ".", chr, ".results"))
coefs <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/", trait, ".", chr, ".coefs"))
result1 <- data.frame(matrix(ncol = 16, nrow = 0))
for (j in seq_len(n_genes)) {
result2 <- data.frame(matrix(ncol = 16, nrow = 0))
gene <- genes$ENSGID[j]
result2[1, 1] <- gene
result2[1, 2] <- sentinel
score <- PoPS[PoPS$ENSGID == gene, ]$Score[1]
result2[1, 3] <- score
result2[1, 4] <- i
# Top 10 features driving the score, ranked by |feature value * coefficient|.
# copy() because ":=" below modifies a data.table by reference.
beta <- copy(coefs)
X_all <- gene_features[ENSGID == gene, ]
X <- select(X_all, beta$Feature)
X <- as.data.table(t(X))
beta$X <- X$V1
beta <- beta[, contribute := X * beta_hat]
beta <- beta[order(abs(contribute), decreasing = TRUE)]
for (k in 1:10) {
result2[1, 6 + k] <- beta$Feature[k]
}
# fix: was "<<-"; plain assignment is correct at script top level
result1 <- rbind(result1, result2)
}
result1 <- as.data.table(result1)
result1 <- result1[order(X3, decreasing = TRUE)]
# Highest-scoring gene in the window is flagged as prioritised (X6), X5 = rank
result1$X6 <- FALSE
result1$X6[1] <- TRUE
result1$X5 <- seq_len(n_genes)
result <- rbind(result, result1)
}
}
}
names(result) <- c("ENSGID","sentinel","PoPS_score","signal_id","gene_rank","prioritized","Feature1","Feature2","Feature3","Feature4","Feature5","Feature6","Feature7","Feature8","Feature9","Feature10")
result <- result[order(signal_id,gene_rank)]
write.table(result,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window.txt", row.names=F, quote=F, sep="\t")
write.table(result[gene_rank==1,],"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_500kb_window.txt", row.names=F, quote=F, sep="\t")
#save prioritised genes only:
genes_500 <- unique(result %>% filter(gene_rank == 1) %>% select(ENSGID))
write.table(genes_500,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/pops_var2genes_raw_500kbwindow.txt", row.names=F, quote=F, sep="\t", col.names=F)
#locus 15_rs11071559_60569988_61569988 reveals a gene !
################# mapping features ###############
# Get gene information using biomaRt for all genes
library(biomaRt) # Requires R/4.1.0

# Map ENSG IDs to HGNC symbols via Ensembl GRCh37 and merge onto the PoPS table.
# Factored into a helper: the 250kb and 500kb passes were duplicated code.
# NOTE(review): version = "GRCh37" relies on the "GRCh37" row of
# listEnsemblArchives(); biomaRt also provides GRCh = 37 for this — confirm.
map_gene_symbols <- function(pops) {
  ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl", version = "GRCh37")
  genes <- as_tibble(getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id', 'strand'),
                           filters=c('ensembl_gene_id'),
                           values=list(sort(unique(pops$ENSGID))),
                           mart=ensembl)) %>%
    rename(gene_symbol = hgnc_symbol, gene_strand = strand)
  genes <- as.data.table(genes)
  setnames(genes, "ensembl_gene_id", "ENSGID")
  merge(pops, genes, by = "ENSGID")
}
#250Kb window results
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_250kb_window.txt")
merged <- map_gene_symbols(pops)
write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window_gene_mapped.txt", row.names=F, quote=F, sep="\t")
#500Kb window results
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_500kb_window.txt")
merged <- map_gene_symbols(pops)
write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window_gene_mapped.txt", row.names=F, quote=F, sep="\t")
detach("package:biomaRt", unload=TRUE)
##########################################################
# Combine windows: keep all 250kb-window results; use 500kb-window results only
# for sentinels absent from the 250kb pass. Keep positive PoPS scores only
# (negative scores = low evidence, see header note).
w250 <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window_gene_mapped.txt") %>%
  mutate(window="250kb")
w500 <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window_gene_mapped.txt") %>%
  mutate(window="500kb")
w500_use <- w500[!sentinel%in%w250$sentinel,]
merged <- rbind(w250,w500_use)
merged <- merged[PoPS_score>0,]
write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_results_merged_table.txt", row.names=F, quote=F, sep="\t")
#save final list of genes:
genes <- unique(merged %>% select(gene_symbol))
write.table(genes,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/pops_var2genes_raw.txt", row.names=F, quote=F, sep="\t")
PoPS is a gene prioritisation analysis that uses genome-wide signals
from GWAS summary statistics and identifies prioritised genes by looking
into public bulk and single-cell expression datasets, curated biological
pathways, and predicted protein-protein interactions (https://github.com/FinucaneLab/pops; https://www.nature.com/articles/s41588-023-01443-6).
PoPS includes three steps:
Step 0: Generate MAGMA scores: run MAGMA to obtain gene association statistics (z-scores) using severe asthma GWAS sumstats and 1000G European individuals;
Step 1: Select features: looking at the MAGMA scores, certain features are selected since they are enriched, and stored in the .feature output files;
Step 2: Run PoPS: calculate the POP score for each gene using a joint model with a leave-one chromosome out method for the enrichment of all selected features
For each variant, I analysed all the genes within 500Kb and reported the top prioritised genes as by PoPS; if no prioritised genes, I enlarged the genomic window to 1MB.
src/rare_disease/rare_disease.r
#!/usr/bin/env Rscript
#Rationale: Find nearby rare mendelian genes that are within +/- 500Kb from SA GWAS sentinels.
# Inputs: UCSC RefSeq gene table (hg19, fetched live), Orphanet gene-disease and
# HPO annotation exports (Excel), and the severe-asthma credible sets.
library(GenomicRanges)
library(rtracklayer)
library(tidyverse)
library(pander)
library(scales)
library(readxl)
library(data.table)
# Define distance to look up nearby rare disease associated genes
DISTANCE <- 5e5
# funtion to work out the number unique element of the yth column of data table x
getN <- function(x, y) x %>% pull({{y}}) %>% n_distinct
## retrieve genomic reference for genes
# NOTE(review): requires network access to UCSC; output reflects the live table.
ucsc = browserSession("UCSC") # methods for getting browser sessions
genome(ucsc) <- "hg19" # setting the sequence information stored in ucsc
# genomic reference for genes
refseq <- ucsc %>%
ucscTableQuery(track="NCBI RefSeq", table="refGene") %>%
getTable %>%
as_tibble
## genes and rare disease association from orphadata
#https://www.orphadata.com/data/xml/en_product6.xml
# Name12/Name/Name24 are generic column labels of the Orphanet Excel export;
# here they are renamed to gene name, disease name, and association type.
orphanet_genes <- read_xlsx("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product6.xlsx") %>%
select(Symbol, Gene=Name12, OrphaCode, Disease=Name, SourceOfValidation,
DisorderGeneAssociationType=Name24) %>%
distinct
# hpo: human phenotype ontology provides a standardized vocabulary of phenotypic abnormalities encountered in human disease
#https://www.orphadata.org/data/xml/en_product4.xml
#create the levels for HPOFrequency column:
# Ordered from most to least frequent, so factor order encodes evidence strength.
hpo_f_levels <- c("Obligate (100%)","Very frequent (99-80%)","Frequent (79-30%)","Occasional (29-5%)","Very rare (<4-1%)","Excluded (0%)")
orphanet_hpo <- read_xlsx("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product4.xlsx") %>%
select(OrphaCode, HPOId, HPOTerm, HPOFrequency=Name15) %>%
distinct %>%
mutate(HPOFrequency=factor(HPOFrequency, levels=hpo_f_levels))
##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","pos")
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$pos,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)
# One row per locus (more in case of PIP ties): the credible-set variant(s) with
# the highest average PIP. rbindlist(lapply(...)) replaces the original pattern
# of growing a data frame with rbind() inside a loop.
sentinels <- rbindlist(lapply(locus, function(i) {
  locus_sig_list <- sig_list_tmp %>% filter(locus == as.character(i))
  locus_sig_list %>%
    filter(PIP_average == max(locus_sig_list$PIP_average)) %>%
    select(locus, sentinel, chr, pos)
}))
fwrite(sentinels, "input/highest_PIP_sentinels",row.names=F,quote=F,sep="\t") #save this for easiness to retrieve them.
## Overlap of GWAS function signals with NCBI Refseq genes
# Create a GRanges object for reference of genes
refseq.GRanges <- refseq %>%
mutate(chrom=sub("^chr", "", chrom)) %>%
select(Symbol=name2, chrom, start=txStart, end=txEnd) %>%
makeGRangesFromDataFrame(keep.extra.columns = TRUE) # takes a data-frame-like object as input and tries to automatically find the columns that describe genomic ranges. It returns them as a GRanges object
# Create a GRanges object for our signals
# (single-base ranges: end.field reuses the start position)
sentinels.GRanges <- sentinels %>%
select(chr, start=pos, sentinel) %>%
makeGRangesFromDataFrame(end.field = "start", keep.extra.columns = TRUE)
# overlap checking
# Pair each sentinel with every transcript within DISTANCE; distance is 0 when
# the sentinel lies inside the transcript, otherwise the gap to the nearer end.
# Per (sentinel, gene): keep the closest, then widest, transcript (one row).
nearby_genes <- mergeByOverlaps(sentinels.GRanges, refseq.GRanges, maxgap=DISTANCE) %>%
as_tibble %>%
select(sentinel, Position=sentinels.GRanges.start, Symbol,
txStart=refseq.GRanges.start, txEnd=refseq.GRanges.end,
width=refseq.GRanges.width) %>%
mutate(distance=ifelse(txStart <= Position & txEnd >= Position, 0,
ifelse(Position < txStart, txStart - Position, Position - txEnd))) %>%
group_by(sentinel, Symbol) %>% # most data operation are done one groups defined by variables
filter(distance == min(distance)) %>%
filter(width == max(width)) %>%
slice(1) %>%
ungroup
# Summary counts used in the report tables below
n_refseq <- refseq %>% getN(name2)
n_nearby_genes_Sentinel <- nearby_genes %>% getN(sentinel)
n_nearby_genes_Symbol <- nearby_genes %>% getN(Symbol)
n_genes <- orphanet_genes %>% getN(Symbol)
n_disease <- orphanet_genes %>% getN(OrphaCode)
n_disease_with_hpo <- orphanet_hpo %>% getN(OrphaCode)
n_hpo <- orphanet_hpo %>% getN(HPOId)
# Merge Orphanet genes with HPO terms and keep only Disease-causing germline genes
# NOTE(review): fct_explicit_na() is deprecated in newer forcats
# (fct_na_value_to_level); keep as-is for the cluster's forcats version.
orphanet <- left_join(orphanet_genes, orphanet_hpo) %>%
filter(grepl("Disease-causing germline", DisorderGeneAssociationType)) %>%
mutate(HPOFrequency = fct_explicit_na(HPOFrequency, "No HPO term"))
n_gene_disease_c <- orphanet %>% group_by(Symbol, Disease) %>% n_groups
n_genes_c <- orphanet %>% getN(Symbol)
n_disease_c <- orphanet %>% getN(Disease)
n_disease_hpo_c <- orphanet %>% group_by(Disease, HPOId) %>% n_groups
n_disease_with_hpo_c <- orphanet %>% filter(!is.na(HPOId)) %>% getN(Disease)
n_hpo_c <- orphanet %>% getN(HPOId)
# Table 1a: distribution of HPO frequency classes over all gene-disease-HPO rows
orphanet %>%
count(HPOFrequency) %>%
mutate(pc=(n/sum(n)) %>% percent) %>%
pander(big.mark=",", caption=paste("Table 1a: Distribution of HPO term frequencies across",
nrow(orphanet) %>% comma, "rows of gene-disease-hpo associations for Disease-causing germline genes"))
# Table 1b: per gene, keep only its highest HPO frequency class (first row after
# arranging by the ordered factor), then tabulate.
orphanet %>%
group_by(Symbol) %>%
arrange(HPOFrequency) %>%
slice(1) %>%
ungroup %>%
count(HPOFrequency) %>%
mutate(pc=(n/sum(n)) %>% percent) %>%
pander(big.mark=",", caption=paste("Table 1b: Distribution of highest HPO term frequency per gene across",
n_genes_c %>% comma, "Disease-causing germline genes"))
## Filtering for asthma terms
# Frequency buckets feeding the ordered "evidence" categories below
red <- hpo_f_levels[1:2]
yellow <- hpo_f_levels[3]
green <- hpo_f_levels[4:5]
#HPOId term for asthma: HP:0002099
# Substring keys matched case-insensitively against HPO terms and disease names
key_terms <- c("asthma","eosin","immunodef","cili","autoimm","leukopenia","neutropenia","macroph")
# Flag asthma relevance via HPO term (excluding "Excluded (0%)") or disease name;
# "overlap" marks genes that also lie within DISTANCE of a sentinel.
orphanet <- orphanet %>%
mutate(asthma_HPO=grepl(paste(key_terms, collapse='|'), HPOTerm, ignore.case = TRUE) &
HPOFrequency != "Excluded (0%)",
asthma_disease=grepl(paste(key_terms, collapse='|'), Disease, ignore.case = TRUE),
asthma=(asthma_HPO | asthma_disease),
overlap=Symbol %in% nearby_genes$Symbol,
evidence=ifelse(!asthma, "Not asthma related",
ifelse(asthma_disease | HPOFrequency %in% red,
"Disease name, obligate or very frequent",
ifelse(HPOFrequency %in% yellow, "Frequent",
ifelse(HPOFrequency %in% green, "Occasional or rare", "Excluded")))) %>%
factor(levels = c("Disease name, obligate or very frequent",
"Frequent",
"Occasional or rare",
"Excluded",
"Not asthma related")))
# Gene counts under each definition of "asthma related"
n_asthma_genes_disease <- orphanet %>% filter(asthma_disease) %>% getN(Symbol)
n_asthma_genes_hpo <- orphanet %>% filter(asthma_HPO) %>% getN(Symbol)
orphanet_asthma <- orphanet %>%
filter(asthma)
n_asthma_genes <- orphanet_asthma %>% getN(Symbol)
# Asthma-related Orphanet genes that also overlap a sentinel window
orphanet_asthma_selected <- orphanet_asthma %>%
filter(overlap) %>%
select(-overlap)
n_asthma_genes_selected <- orphanet_asthma_selected %>% getN(Symbol)
# Table 2a: gene-level 2x2 of asthma-related vs overlapping a sentinel window
table2a <- orphanet %>%
group_by(Symbol) %>%
summarise_at(vars(asthma, overlap), any) %>%
ungroup %>%
count(asthma, overlap) %>%
pivot_wider(names_from = "overlap", values_from = "n", names_prefix = "overlap_") %>%
mutate_at(vars(starts_with("overlap")), list(pc=~(./sum(.)) %>% percent))
table2a_chisq <- table2a %>% select(2:3) %>% chisq.test
# Table 2b: as 2a but stratified by the best (ordered) evidence category per gene
table2b <- orphanet %>%
group_by(Symbol) %>%
arrange(evidence) %>%
slice(1) %>%
ungroup %>%
count(evidence, overlap) %>%
pivot_wider(names_from = "overlap", values_from = "n", names_prefix = "overlap_",
values_fill = list(n=0)) %>%
mutate_at(vars(starts_with("overlap")), list(pc=~(./sum(.)) %>% percent))
table2b_chisq <- table2b %>% select(2:3) %>% chisq.test
# NOTE(review): captions paste0 "For" directly onto the count (missing space).
pander(table2a, big.mark=",",
caption=paste0("Table 2a: For", n_genes_c %>% comma, "rare Disease-causing, germline genes, the numbers of asthma and non-asthma related genes vs. numbers overlapping severe asthma signals ", DISTANCE/1000, "kb. chi^2 P =", table2a_chisq$p.value %>% round(3)))
pander(table2b, big.mark=",",
caption=paste0("Table 2b: For", n_genes_c %>% comma, "rare Disease-causing, germline genes, the numbers of genes in each asthma evidence category vs. numbers overlapping severe asthma signals ", DISTANCE/1000, "kb. chi^2 P =", table2b_chisq$p.value %>% round(3)), split.table=Inf)
## Hypergeometric test
# Enrichment of asthma-related rare-disease genes among genes near sentinels:
# upper-tail P(X >= hitInSample) drawing sampleSize genes from the RefSeq universe.
hitInSample <- n_asthma_genes_selected
hitInPop <- n_asthma_genes
failInPop <- n_refseq - n_asthma_genes
sampleSize <- n_nearby_genes_Symbol
p_asthma_en <- phyper(q=hitInSample - 1,
m=hitInPop,
n=failInPop,
k=sampleSize,
lower.tail = FALSE) # P value for hypergeometric test: 0.2486454
# Table 3: the 2x2 contingency table underlying the hypergeometric test
table3a.m <- matrix(c(hitInSample, hitInPop - hitInSample,
sampleSize - hitInSample, failInPop - sampleSize + hitInSample), 2, 2)
rownames(table3a.m) <- c("yes", "no")
colnames(table3a.m) <- c("asthma", "others")
table3a <- table3a.m %>%
as_tibble(rownames = "Near severe asthma sentinel") %>%
mutate(pc_asthma=(asthma/(asthma + others)) %>% percent)
pander(table3a, big.mark=",", caption=paste0("Table 3: For rare disease-causing genes, the proportion that are asthma related within ", DISTANCE/1000, "kb of a severe asthma sentinel."))
## By-gene and by-SNP results
# Collapse a vector into a single display string for the wide summary tables:
# numeric input -> the minimum as a string (e.g. the closest distance);
# anything else -> unique values joined by "; ". NAs are dropped first.
concat <- function(x) {
  x <- x[!is.na(x)]
  # "&&" (was "&"): scalar short-circuit condition for if(); both operands are scalar
  if (length(x) > 0 && is.numeric(x)) {
    paste(min(x))
  } else {
    paste(unique(x), collapse = "; ")
  }
}
# Wide per-gene summary of asthma-related Orphanet genes near sentinels:
# one row per (Symbol, Gene) with diseases / HPO terms collapsed via concat();
# HPO columns are blanked for rows not flagged asthma_HPO.
orphanet_asthma_selected_wide <- orphanet_asthma_selected %>%
mutate_at(vars(starts_with("HPO")), ~ifelse(.data$asthma_HPO, ., NA)) %>%
arrange(evidence) %>%
group_by(Symbol, Gene) %>%
summarise(Evidence=evidence[1],
nDisease=n_distinct(Disease, na.rm = TRUE),
Diseases=concat(Disease),
Validation=concat(SourceOfValidation),
nHPOTerm=n_distinct(HPOTerm, na.rm = TRUE),
HPOTerms=concat(HPOTerm)) %>%
ungroup %>%
arrange(Evidence)
# Attach transcript coordinates/distances; the constant "Rare" column flags
# genes implicated in rare disease after the joins below.
implicated_rare <- right_join(nearby_genes, orphanet_asthma_selected_wide) %>%
add_column(Rare="Rare", .before = "txStart")
results <- implicated_rare %>% select(-Position) %>% left_join(sentinels, .)
results_rare <- results %>% filter(!is.na(Rare)) %>% arrange(Evidence, distance, chr, pos) %>% ungroup
# One row per sentinel, gene-level fields collapsed with concat()
results_by_snp <- results %>%
arrange(Evidence, distance) %>%
mutate_at(vars(Rare), ~ifelse(!is.na(.), .data$Symbol, NA)) %>%
group_by(sentinel) %>%
summarise_at(vars(Rare, Evidence, distance), concat) %>%
ungroup %>%
left_join(sentinels, .)
# Per-sentinel hypergeometric enrichment (reuses hitInPop/failInPop from above)
results_by_snp_phyper <- results_rare %>%
group_by(sentinel) %>%
summarise(hitInSample_snp=n()) %>%
ungroup %>%
left_join(nearby_genes %>%
group_by(sentinel) %>%
summarise(sampleSize_snp=n())) %>%
mutate(hyperG_p=phyper(q=hitInSample_snp - 1,
m=hitInPop,
n=failInPop,
k=sampleSize_snp,
lower.tail = FALSE)) %>%
left_join(results_by_snp, .)
# One row per gene, sentinel-level fields collapsed with concat()
results_by_gene <- results %>%
filter(!is.na(Symbol)) %>%
arrange(Evidence, distance) %>%
mutate_at(vars(Rare), ~ifelse(!is.na(.), .data$sentinel, NA)) %>%
group_by(Symbol) %>%
summarise_at(vars(Rare, Evidence, distance, Diseases, HPOTerms), concat)
# Write the four result views plus the raw gene list for the var2gene summary
out_base <- paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/rare_disease/results_", DISTANCE/1000, "kb")
write_csv(results, paste0(out_base, ".csv"), na = "")
write_csv(results_rare, paste0(out_base, "_rare.csv"), na = "")
write_csv(results_by_snp_phyper, paste0(out_base, "_by_snp.csv"), na = "")
write_csv(results_by_gene, paste0(out_base, "_by_gene.csv"), na = "")
#save genes only:
fwrite(as.data.frame(results_by_gene$Symbol), "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_disease_genes_raw.txt", na = "",col.names=F,quote=F)
Using the ORPHANET resource (https://www.orpha.net/), I analysed whether any gene within 500Kb of the top causal variant for each fine-mapped locus was associated with rare diseases showing implication in asthma. In doing so, I filtered for Human Phenotype Ontology (HPO) terms or diseases that include any of the key terms ‘asthma’/‘eosin’/‘immunodef’/‘cili’/‘autoimm’/‘leukopenia’/‘neutropenia’/‘macroph’.
src/mouse_ko/mouse_ko.r
#!/usr/bin/env Rscript
#Rationale: Find nearby human ortholog of mouse knockout genes that are within +/- 500Kb from SA GWAS sentinels.
library(GenomicRanges)
library(rtracklayer)
library(tidyverse)
library(pander)
library(scales)
library(readxl)
library(data.table)
# Working directory holds the downloaded IMPC / HCOP input files (see below).
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/")
# Window (bp) around each sentinel within which a gene counts as "nearby".
DISTANCE <- 5e5
getN <- function(x, y) x %>% pull({{y}}) %>% n_distinct
##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","pos")
# Variant ID as chr_pos_allele1_allele2:
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$pos,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)
# For each locus keep the credible-set variant(s) with the highest average
# PIP (the "sentinel"). Built in one pass with lapply + a single rbind
# instead of growing a data frame with rbind() inside a for-loop.
sentinels <- as.data.frame(do.call(rbind, lapply(locus, function(i) {
  sig_list_tmp %>%
    filter(locus == as.character(i)) %>%
    filter(PIP_average == max(PIP_average)) %>%
    select(locus, sentinel, chr, pos)
})))
## retrieve genomic reference for genes
#resolved problem:
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install(version = "3.18")
# Query the UCSC table browser (hg19 NCBI RefSeq / refGene table).
# NOTE(review): requires network access to UCSC at run time.
ucsc <- browserSession("UCSC")
genome(ucsc) <- "hg19"
refseq <- ucsc %>%
ucscTableQuery(track="NCBI RefSeq", table="refGene") %>%
getTable %>%
as_tibble
## Mouse genotype-phenotype data
## Store in /scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/ and downloaded from:
#latest release 2023-07-06:
#wget -P ${tmp_path}/mouse_ko/ https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/genotype-phenotype-assertions-ALL.csv.gz
#latest release 2023-11-22:
#wget -P ${tmp_path}/mouse_ko/ http://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz
# HCOP mouse-human ortholog map and IMPC genotype-phenotype assertions:
orthologs <- read_tsv("human_mouse_hcop_fifteen_column.txt.gz")
all_geno_pheno <- read_csv("genotype-phenotype-assertions-ALL.csv.gz")
# Split the MP term at the first comma: text before it is kept as the
# top-level term, anything after it as the subtype.
geno_pheno <- all_geno_pheno %>%
separate(top_level_mp_term_name, c("top_level_mp_term", "top_level_mp_subtype"), ",")
#display all the possible top level mp term and related number of genes:
geno_pheno %>%
group_by(top_level_mp_term) %>%
summarise(nlines=n(), ngenes=n_distinct(marker_symbol)) %>%
drop_na %>%
pander(big.mark=",")
## Overlap of severe asthma - SA signals with NCBI Refseq genes
# RefSeq transcripts as genomic ranges (one range per transcript):
refseq.GRanges <- refseq %>%
select(Symbol=name2, chrom, start=txStart, end=txEnd) %>%
makeGRangesFromDataFrame(keep.extra.columns = TRUE)
# Sentinel positions as single-bp ranges on "chr"-prefixed chromosomes:
SA.GRanges <- sentinels %>%
mutate(Chrom=paste0("chr", chr)) %>%
select(Chrom, start=pos, end=pos, sentinel) %>%
makeGRangesFromDataFrame(keep.extra.columns = TRUE)
# Genes within DISTANCE bp of a sentinel. distance is 0 when the sentinel
# lies inside the transcript; for each sentinel-gene pair keep the closest
# transcript, break ties by the widest one, then take a single row.
nearby_genes <- mergeByOverlaps(SA.GRanges, refseq.GRanges, maxgap=DISTANCE) %>%
as_tibble %>%
select(sentinel, Position=SA.GRanges.start, Symbol,
txStart=refseq.GRanges.start, txEnd=refseq.GRanges.end,
width=refseq.GRanges.width) %>%
mutate(distance=ifelse(txStart <= Position & txEnd >= Position, 0,
ifelse(Position < txStart, txStart - Position, Position - txEnd))) %>%
group_by(sentinel, Symbol) %>%
filter(distance == min(distance)) %>%
filter(width == max(width)) %>%
slice(1) %>%
ungroup
# Denominators used by the enrichment tests below:
n_refseq <- refseq %>% getN(name2)
n_nearby_genes_sentinel <- nearby_genes %>% getN(sentinel)
n_nearby_genes_Symbol <- nearby_genes %>% getN(Symbol)
## Orthologs of mouse KO genes with asthma relevant phenotype - BROAD filter
#BROAD filter for top_level_mp_term == "respiratory system phenotype" give immunity and/or muscle related term as well:
# Top-level MP terms considered asthma-relevant, defined once so the two
# filters below cannot drift apart:
asthma_top_mp_terms <- c("respiratory system phenotype",
                         "immune system phenotype",
                         "muscle phenotype")
ko_mouse_mp <- geno_pheno %>%
filter(top_level_mp_term %in% asthma_top_mp_terms) %>%
select(top_level_mp_term, mp_term_name) %>%
distinct %>% arrange(top_level_mp_term,mp_term_name)
#save the top level and subtype terms that I use:
fwrite(ko_mouse_mp,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/MP_TERM_approx_asthma_respimmmuscle.txt",quote=F, sep="\t")
# Mouse KO genes annotated with any of the relevant top-level terms:
ko_mouse <- geno_pheno %>%
filter(top_level_mp_term %in% asthma_top_mp_terms) %>%
select(marker_symbol, mp_term_name) %>%
distinct
# Map mouse KO genes to human orthologs (dropping OrthoMCL-only support) and
# flag those that fall near a severe-asthma sentinel:
ko_human <- orthologs %>%
filter(support != "OrthoMCL") %>%
select(mouse_symbol, human_symbol) %>%
inner_join(ko_mouse, c("mouse_symbol" = "marker_symbol")) %>%
mutate(overlap=human_symbol %in% nearby_genes$Symbol)
# Mouse KO genes with no accepted human ortholog:
no_ortholog <- setdiff(ko_mouse$marker_symbol, ko_human$mouse_symbol)
ko_human_selected <- ko_human %>%
filter(overlap)
## Hypergeometric test
# Are genes near SA sentinels enriched for human orthologs of mouse KO
# genes with asthma-relevant phenotypes?
hitInSample <- nrow(ko_human_selected)
hitInPop <- nrow(ko_human)
failInPop <- n_refseq - hitInPop
sampleSize <- n_nearby_genes_Symbol
# Upper-tail probability P(X >= hitInSample) under the hypergeometric null:
p_resp_en <- phyper(q=hitInSample - 1,
m=hitInPop,
n=failInPop,
k=sampleSize,
lower.tail = FALSE)
# 2x2 contingency table: near-sentinel (yes/no) x asthma-related (yes/no):
table3a.m <- matrix(c(hitInSample, hitInPop - hitInSample,
sampleSize - hitInSample, failInPop - sampleSize + hitInSample), 2, 2)
rownames(table3a.m) <- c("yes", "no")
colnames(table3a.m) <- c("asthma", "others")
table3a <- table3a.m %>%
as_tibble(rownames = "Near severe asthma sentinel") %>%
mutate(pc_asthma=(asthma/(asthma + others)) %>% percent)
pander(table3a, big.mark=",", caption=paste0("Table 3: For mouse knockout-causing genes, the proportion that are asthma related within ", DISTANCE/1000, "kb of an asthma function sentinel."))
### By-gene and by-SNP results
# Concatenate the distinct non-missing values of x into one "; "-separated string.
concat <- function(x) {
  paste(unique(x[!is.na(x)]), collapse = "; ")
}
# Genes near a sentinel that are also mouse-KO implicated ("MKO" flag):
implicated_mko <- right_join(nearby_genes, ko_human_selected, c("Symbol" = "human_symbol")) %>%
add_column(MKO="MKO", .before = "txStart")
results <- left_join(sentinels, implicated_mko)
# MKO-implicated genes only, closest first:
results_mko <- results %>%
filter(!is.na(MKO)) %>%
arrange(distance, chr, pos) %>%
ungroup
# One row per sentinel, with implicated gene symbols concatenated:
results_by_snp <- results %>%
arrange(distance) %>%
mutate_at(vars(MKO), ~ifelse(!is.na(.), .data$Symbol, NA)) %>%
group_by(sentinel) %>%
summarise_at(vars(MKO, distance), concat) %>%
ungroup %>%
left_join(sentinels, .)
# Per-sentinel hypergeometric enrichment of MKO genes among nearby genes:
results_by_snp_phyper <- results_mko %>%
group_by(sentinel) %>%
summarise(hitInSample_snp=n()) %>%
ungroup %>%
left_join(nearby_genes %>%
group_by(sentinel) %>%
summarise(sampleSize_snp=n())) %>%
mutate(hyperG_p=phyper(q=hitInSample_snp - 1,
m=hitInPop,
n=failInPop,
k=sampleSize_snp,
lower.tail = FALSE)) %>%
left_join(results_by_snp, .)
# One row per gene, with the sentinels that implicate it concatenated:
results_by_gene <- results %>%
filter(!is.na(Symbol)) %>%
arrange(distance) %>%
mutate_at(vars(MKO), ~ifelse(!is.na(.), .data$sentinel, NA)) %>%
group_by(Symbol) %>%
summarise_at(vars(MKO, distance), concat) %>%
ungroup
results_by_snp_phyper %>% filter(!is.na(hyperG_p)) %>% count(hyperG_p < 0.05) %>%
pander(caption="By SNP hypergeometric results")
# Write full / MKO-only / by-SNP / by-gene tables:
out_base <- paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/results_", DISTANCE/1000, "kb")
write_csv(results, paste0(out_base, ".csv"), na = "")
write_csv(results_mko, paste0(out_base, "_mko.csv"), na = "")
write_csv(results_by_snp_phyper, paste0(out_base, "_by_snp.csv"), na = "")
write_csv(results_by_gene, paste0(out_base, "_by_gene.csv"), na = "") # gives me the gene list
# For every implicated human/mouse gene pair, keep the single IMPC assertion
# with the smallest p-value among the selected MP terms:
MP_TERM <- unique(ko_mouse_mp$mp_term_name)
inner_join(results_mko %>%
select(-MKO, -txStart, -txEnd, -width, -overlap),
results_by_snp_phyper %>% select(sentinel, hyperG_p)) %>%
inner_join(all_geno_pheno %>%
filter(mp_term_name %in% MP_TERM) %>%
select(marker_symbol, mp_term_name, p_value, percentage_change, effect_size),
c("mouse_symbol" = "marker_symbol")) %>%
group_by(Symbol, mouse_symbol) %>%
arrange(p_value) %>%
slice(1) %>%
write_csv("mouse_ko.csv")
#save genes only:
fwrite(as.data.frame(results_by_gene$Symbol), "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/mouse_ko_genes_raw.txt", na = "",col.names=F,quote=F)
Using the International Mouse Phenotyping Consortium resource (https://www.mousephenotype.org/), I analysed whether any human ortholog gene has been studied in a mouse model in the context of respiratory, immune or muscle phenotypes (top-level Mouse Phenotype (MP) terms ‘respiratory system phenotype’/‘immune system phenotype’/‘muscle phenotype’). I selected relevant genes within 500Kb of the top causal variant for each fine-mapped locus.
src/rare_variant/000_dataprep_rarevar.R
#!/usr/bin/env Rscript
#Rationale: data prep for rare-variant analysis in the RAP - phenotype-covariate file
library(tidyverse)
library(data.table)
# Bridge file maps sample IDs between UKB applications 56607 and 88144.
bridge_file <- "/data/gen1/UKBiobank/application_88144/Bridge_eids_88144_56607.csv"
pheno_cov_file <- "/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/demo_EUR_pheno_cov_broadasthma.txt"
#match ID application 56607 to obtain phenotype-covariate file with ID application 88144:
bridge <- fread(bridge_file)
# FID/IID (PLINK/REGENIE convention) carry the application-88144 IDs.
bridge$FID <- bridge$eid_88144
bridge$IID <- bridge$eid_88144
pheno_cov <- fread(pheno_cov_file) %>% select(-FID,-IID)
# rename(new = old): the phenotype file's `eid` column holds application-56607 IDs.
pheno_cov <- pheno_cov %>% rename("eid_56607" = eid)
pheno_cov_88144 <- left_join(pheno_cov,bridge,by="eid_56607") %>% relocate(IID, .before = missing) %>% relocate(FID, .before = IID)
write.table(pheno_cov_88144,"/scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/demo_EUR_pheno_cov_broadasthma_app88144.txt",
row.names = FALSE, col.names = TRUE ,quote=FALSE, sep=" ", na = "NA")
src/rare_variant/README.txt
#Rationale: Miscellaneous knowledge about the dx toolkit, RAP and CLI to run the rare-variant single-variant and gene-based collapsing analysis step by step
#In REGENIE, phenotype and covariates column names:
file: /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt
pheno="broad_pheno_1_5_ratio"
--covarColList age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex
#Download the CLI:
https://documentation.dnanexus.com/downloads
#Tutorial videos at:
https://dnanexus.gitbook.io/uk-biobank-rap/working-on-the-research-analysis-platform/accessing-and-using-the-command-line-interface
#When indexing files on the RAP in the CLI: always put the project ID, it makes it run faster:
project-GGzFY70JBJzVx22v4Yj980J1:/Bulk/Genotype Results/CEL files/11/
#Upload the phenotype file on the platform through dx toolkit:
dx login
dx logout
dx upload /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt
dx describe #description of the tool
src/rare_variant/submit_rare_variant.sh
#!/bin/bash
#Rationale: run Single-variant rare-variant ExWAS in the RAP from command-line interface CLI, using dx toolkit syntax
#Command-line interface (script)
#Script from Kath:
#What follows is my bash script for running a GWAS (using regenie) on the command-line interface. I used the genotyping data for the first step and the exome data for the second. For more information on the command-line interface and how to use it, I'd recommend the videos on this page: https://dnanexus.gitbook.io/uk-biobank-rap/working-on-the-research-analysis-platform/accessing-and-using-the-command-line-interface
#https://dnanexus.gitbook.io/uk-biobank-rap/getting-started/research-analysis-platform-training-webinars
cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene/
#Start of script for regenie GWAS:
dx login
# You will need to enter your username and password at this stage, and select the number corresponding to your project from the list
dx select Severe_asthma
dx mkdir analysis
data_file_dir="/analysis/"
# Merging chromosomes together (genotyping data):
#The RAP folder names have spaces in them, so '\' is needed to escape them:
# demo_EUR_pheno_cov_broadasthma_app88144.txt is the phenotype file I generated on ALICE3.
# running locally in the CLI, and then it gives a job that I am able to monitor on the RAP.
run_merge="cp /mnt/project/Bulk/Genotype\ Results/Genotype\ calls/ukb22418_c[1-9]* . ; ls *.bed | sed -e 's/.bed//g'> files_to_merge.txt; plink --merge-list files_to_merge.txt --make-bed --autosome-xy --out ukb22418_c1_22_v2_merged; rm files_to_merge.txt;"
dx run swiss-army-knife -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" -icmd="${run_merge}" --tag="Merge" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes
# QC-ing the merged genotyping data:
# Extract FID/IID for the European subset locally, then upload the ID list:
awk {'print $1, $2'} /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt | tail -n +2 > /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/ukb_eur_ids
dx cd analysis
dx upload ukb_eur_ids
# Variant and sample QC with plink2 on the merged genotype calls:
run_plink_qc="plink2 --bfile ukb22418_c1_22_v2_merged --keep ukb_eur_ids --autosome --maf 0.01 --mac 20 --geno 0.1 --hwe 1e-15 --mind 0.1 --write-snplist --write-samples --no-id-header --out snp_qc_pass"
dx run swiss-army-knife -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bed" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bim" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.fam" -iin="${data_file_dir}/ukb_eur_ids" -icmd="${run_plink_qc}" --tag="plink_QC_eur_ukb" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes
# Running regenie step 1:
##Using data that represents the whole genome, either imputed or whole genome seq:
# Step 1 output (sa_results_pred.list / sa_results_1.loco.gz) is consumed by
# the step-2 jobs further down.
run_regenie_step1="regenie --step 1 \
--lowmem \
--out sa_results \
--bed ukb22418_c1_22_v2_merged \
--phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--extract snp_qc_pass.snplist \
--phenoCol broad_pheno_1_5_ratio \
--covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
--bsize 1000 \
--bt \
--loocv \
--gz \
--threads 16"
dx run swiss-army-knife -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bed" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bim" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.fam" -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" -iin="${data_file_dir}/snp_qc_pass.snplist" -icmd="${run_regenie_step1}" --tag="Step1" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes
# QC-ing the exome data:
exome_file_dir="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
field_name="ukb23158"
# Per-chromosome variant/sample QC on the WES PLINK files; each iteration
# submits one swiss-army-knife job on the RAP.
for chr in {1..22}
do
run_plink_wes="plink2 \
--bfile ukb23158_c${chr}_b0_v1 \
--no-psam-pheno \
--keep ukb_eur_ids \
--autosome \
--mac 3 \
--geno 0.1 \
--hwe 1e-15 \
--mind 0.1 \
--write-snplist \
--write-samples \
--no-id-header \
--out WES_c${chr}_snps_qc_pass"
dx run swiss-army-knife \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bed" \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bim" \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.fam" \
-iin="${data_file_dir}/ukb_eur_ids" \
-icmd="${run_plink_wes}" \
--tag="QC_ExWAS" \
--instance-type "mem1_ssd1_v2_x16" \
--name "QC_chr${chr}" \
--destination="${data_file_dir}" \
--brief \
--yes
done
# Running regenie step 2:
exome_file_dir="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
# Variants failing the 90%-of-samples-at-10x-depth QC are excluded:
EXCLUDE="ukb23158_500k_OQFE.90pct10dp_qc_variants.txt"
field_name="ukb23158"
# Per-chromosome single-variant association on the QC-passed WES data,
# using the step-1 LOCO predictions:
for chr in {1..22}
do
run_regenie_cmd="regenie \
--step 2 \
--bed ukb23158_c${chr}_b0_v1 \
--out ExWAS_SA_assoc.c${chr} \
--phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--bt \
--approx \
--firth-se \
--firth \
--extract WES_c${chr}_snps_qc_pass.snplist \
--exclude $EXCLUDE \
--phenoCol broad_pheno_1_5_ratio \
--covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
--bsize 400 \
--pred sa_results_pred.list \
--pThresh 0.05 \
--minMAC 3 \
--threads 16 \
--gz"
dx run swiss-army-knife \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bed" \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bim" \
-iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.fam" \
-iin="${data_file_dir}/WES_c${chr}_snps_qc_pass.snplist" \
-iin="${exome_file_dir}/helper_files/${EXCLUDE}" \
-iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" \
-iin="${data_file_dir}/sa_results_pred.list" \
-iin="${data_file_dir}/sa_results_1.loco.gz" \
-icmd="${run_regenie_cmd}" \
--tag="Step2" \
--instance-type "mem1_ssd1_v2_x16" \
--name="regenie_step2_chr${chr}" \
--destination="${data_file_dir}" \
--brief \
--yes
done
#Download a log file to have it locally on ALICE3:
#Var_to_Gene/input/regenie_step2_chr5:
#REGENIE v3.1.1.gz
#* case-control counts for each trait:
#'broad_pheno_1_5_ratio': 7413 cases and 36955 controls
# Merging output of regenie and formatting for visualising using LocusZoom:
merge_cmd='out_file="assoc.regenie.merged.txt"; cp /mnt/project/analysis/*.regenie.gz . ; gunzip *.regenie.gz ; echo -e "#CHROM\tGENPOS\tID\tALLELE0\tALLELE1\tA1FREQ\tN\tTEST\tBETA\tSE\tCHISQ\tLOG10P\tEXTRA" > $out_file ; files="./*.regenie"; for f in $files; do tail -n+2 $f | tr " " "\t" >> $out_file; done ; rm *.regenie'
dx run swiss-army-knife \
-iin="${data_file_dir}/ExWAS_SA_assoc.c1_broad_pheno_1_5_ratio.regenie.gz" \
-icmd="${merge_cmd}" \
--tag="Merge_regenie_results" \
--instance-type "mem1_ssd1_v2_x16" \
--destination="${data_file_dir}" \
--brief \
--yes
# NOTE(review): a stray `done` followed this command with no matching loop,
# which made the script a bash syntax error; it has been removed.
#Run LocusZoom:
#On the RAP Platform:
#Tools > Tools Library > LocusZoom > Run
#Output folder: Severe_asthma/analysis/
#Input files: Severe_asthma/analysis/assoc.regenie.merged.txt"
#JobName: LocusZoom_single_rarevar_ExWAS
#and then WORKER URL to select the column:
#Build38
#Chromosome: CHROM
#Ref allele: ALLELE0
#P-value column: LOG10P tick 'is -log10(p)'
#Position: GENPOS
#Alt allele: ALLELE1
#Effect size: BETA
#Std Err.: SE
#Affect allele: Alt
#Specify from: Frequency
#Frequency: A1FREQ
#Download summary statistics, log, screenshot of LocusZoom and QQPlot and upload in /Var_to_Gene/input/rare_variant/single_rarevar_ExWAS/
#Let's filter our data first
#MAF < 0.01 (Because for my common variants SA GWAS, I filter for MAF >= 0.01) and p-value <= 5E-6 (as suggestive threshold):
Rscript src/rare_variant/create_input_munge_summary_stats.R \
input/rare_variant/single_rarevar_EwWAS/summary_stats.gz \
"SA"
#Look-up of exonic rare variants (MAF < 0.01) within +/- 500Kb of credible set variants for each locus single variant and gene-based results.
#Genes of exonic rare variants in the regions with suggestive p-value <= 5E-6 were identified.
# For each of the 615 lifted-over credible-set variants, print suggestive
# rare variants on the same chromosome within +/- 500Kb:
for line in {1..615}
do
chr=$(awk -v row="$line" ' NR == row {print $1 } ' input/cs_vars_liftover_output.bed | sed 's/chr//g')
pos=$(awk -v row="$line" 'NR == row {print $2 }' input/cs_vars_liftover_output.bed)
awk -v chr_idx=$chr -v pos_idx=$pos '$2 == chr_idx && $3 >= pos_idx-500000 && $3 <= pos_idx+500000 {print}' input/rare_variant/single_rarevar_EwWAS/SArarevar_suggestive
done
##no Look-up results
#Discovery analysis of significant associated variants:
#Significant threshold: 0.05/N-markers: 0.05/1772264
# NOTE(review): $(0.05/1772264) tried to execute "0.05/1772264" as a command,
# and shell arithmetic is integer-only, so compute the float threshold via awk:
pval_thr=$(awk 'BEGIN{print 0.05/1772264}')
# Arguments: summary stats, phenotype label, window (bp), nominal threshold,
# N markers for multiple-testing correction.
# NOTE(review): inline comments after trailing backslashes broke the line
# continuation of this command; they were moved above the call.
Rscript src/rare_variant/sentinel_selection.R \
input/rare_variant/single_rarevar_EwWAS/SArarevar_betase_input_mungestat \
"SA" \
500000 \
0.05 \
1772264
src/rare_variant/create_input_munge_summary_stats.R
#!/usr/bin/env Rscript
#Rationale: filter out rare variants from the ExWAS summary statistics.
library(data.table)
library(tidyverse)
# args[1]: merged REGENIE summary-stats file (headerless, tab-separated);
# args[2]: phenotype label used to name the output files.
args = commandArgs(trailingOnly=TRUE)
df_file <- args[1]
pheno <- as.character(args[2])
df <- fread(df_file,header=F,sep="\t")
#"#chrom" "pos" "rsid" "ref"
#"alt" "neg_log_pvalue" "beta" "stderr_beta"
#"alt_allele_freq"
#CHROM GENPOS ID ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA SE CHISQ LOG10P
#REGENIE: reference allele (allele 0), alternative allele (allele 1)
#REGENIE: estimated effect sizes (for allele 1 on the original scale)
colnames(df) <- c("CHROM", "GENPOS", "ID", "ALLELE0", "ALLELE1", "LOG10P", "BETA", "SE", "A1FREQ")
dfclean <- df %>% select(CHROM,ID,GENPOS,ALLELE1,ALLELE0,A1FREQ,LOG10P,BETA,SE)
# Convert REGENIE's -log10(p) back to a plain p-value:
dfclean$pval <- 10^(-as.numeric(dfclean$LOG10P))
#select column in order:
dfclean <- dfclean %>% select(ID,CHROM,GENPOS,ALLELE1,ALLELE0,BETA,SE,A1FREQ,pval)
#rename columns:
colnames(dfclean) <- c("snpid", "chr", "bp38", "a1", "a2", "LOG_ODDS", "se", "eaf", "pval")
#make sure eaf column is numeric:
dfclean$eaf <- as.numeric(dfclean$eaf)
# Minor allele frequency: fold the effect-allele frequency at 0.5
# (column refs inside mutate, instead of dfclean$...):
dfclean <- dfclean %>% mutate(MAF = ifelse(eaf <= 0.5, eaf, 1 - eaf))
dfclean_rarevar <- dfclean %>% filter(MAF < 0.01)
#save gwas results for variants with MAF < 0.01:
write.table(dfclean_rarevar,paste0("input/rare_variant/single_rarevar_EwWAS/",pheno,"rarevar_betase_input_mungestat"),quote=FALSE,sep="\t",row.names=FALSE)
#save gwas results for variants with MAF < 0.01 and pvalue <= 0.000005:
dfclean_rarevar_sugg <- dfclean_rarevar %>% filter(pval <= 0.000005)
write.table(dfclean_rarevar_sugg,paste0("input/rare_variant/single_rarevar_EwWAS/",pheno,"rarevar_suggestive"),quote=FALSE,sep="\t",row.names=FALSE)
src/rare_variant/submit_collapsing_bt.sh
#!/bin/bash
#Rationale: gene-based collapsing rare variant ExWAS - Based on Nick's script
#guidelines at:
#https://dnanexus.gitbook.io/uk-biobank-rap/science-corner/using-regenie-to-generate-variant-masks
#https://community.dnanexus.com/s/question/0D582000004zTItCAM/burden-test-from-regenie-in-ukb-wes-data-the-result-does-not-contain-alleles-or-beta-and-se
#https://github.com/rgcgithub/regenie/issues/327
dx login
dx select Severe_asthma
dx mkdir /analysis/collapsing_Hg38/
dx cd /analysis/collapsing_Hg38/
# Pre-built WES annotation and set-list files, uploaded to the RAP:
dx upload /scratch/gen1/nrgs1/rare_variant/collapsing_Hg38/ukb23158_500k_OQFE.annotations.txt.gz
dx upload /scratch/gen1/nrgs1/rare_variant/collapsing_Hg38/ukb23158_500k_OQFE.sets.txt.gz
#Create the backmask.def file:
# Masks: M1 = LoF; M3 = LoF + missense(5/5); M4 = LoF + missense(5/5) + missense(>=1/5).
printf "M1 LoF\nM3 LoF,missense(5/5)\nM4 LoF,missense(5/5),missense(>=1/5)" > /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/backmask.def
dx upload /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/backmask.def
dx cd ../..
# Shared settings for the per-chromosome REGENIE jobs below:
SEED=/analysis/
BASE=/collapsing_Hg38/
EXOME_PATH="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
ANNOT=ukb23158_500k_OQFE.annotations.txt.gz
SETLIST=ukb23158_500k_OQFE.sets.txt.gz
EXCLUDE=ukb23158_500k_OQFE.90pct10dp_qc_variants.txt
MASK=backmask.def
# AAF bins for mask construction; max AAF and tests for the vc (gene-based) tests:
AAF_BINS=0.01,0.001,0.0001,0.00001
JOINT_TESTS=acat
MAXAFF=0.01
TESTS=acatv,skato
THREADS=16
# Choose the mask-building mode from the argument count.
# NOTE(review): previously the "_sum" suffix was appended to OUT/NAME here,
# before the loop (re)assigned both variables, so the suffix was silently
# discarded; it is now recorded in SUFFIX and applied inside the loop.
if [ $# -gt 1 ]; then
BUILD_MASK="--build-mask sum"
SUFFIX="_sum"
else
BUILD_MASK="--write-mask --write-mask-snplist"
SUFFIX=""
fi
# One gene-based collapsing REGENIE step-2 job per autosome:
for CHR in {1..22}
do
NAME=sa_collapsing_chr${CHR}_gene_p${SUFFIX}
BED=ukb23158_c${CHR}_b0_v1
OUT=${BED}_sa_collapsing_backman_gene_p${SUFFIX}
REGENIE_CMD="regenie \
--bed $BED \
--exclude $EXCLUDE \
--extract WES_c${CHR}_snps_qc_pass.snplist \
--step 2 \
--pred sa_results_pred.list \
--phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--phenoCol broad_pheno_1_5_ratio \
--covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
--bt \
--anno-file ${ANNOT} \
--set-list ${SETLIST} \
--mask-def ${MASK} $BUILD_MASK \
--firth --approx --pThresh 0.01 \
--bsize 400 \
--minMAC 3 \
--aaf-bins $AAF_BINS \
--joint $JOINT_TESTS \
--vc-maxAAF $MAXAFF \
--vc-tests $TESTS \
--rgc-gene-p \
--threads=$THREADS \
--out $OUT"
dx run swiss-army-knife \
-iin="${EXOME_PATH}/${BED}.bed" \
-iin="${EXOME_PATH}/${BED}.bim" \
-iin="${EXOME_PATH}/${BED}.fam" \
-iin="${SEED}/WES_c${CHR}_snps_qc_pass.snplist" \
-iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" \
-iin="${SEED}/sa_results_pred.list" \
-iin="${SEED}/sa_results_1.loco.gz" \
-iin="${SEED}/${BASE}/${ANNOT}" \
-iin="${EXOME_PATH}/helper_files/${EXCLUDE}" \
-iin="${SEED}/${BASE}/${SETLIST}" \
-iin="${SEED}/${BASE}/${MASK}" \
-icmd="$REGENIE_CMD" \
--instance-type "mem1_ssd1_v2_x16" \
--name="$NAME" \
--destination="${SEED}/${BASE}" \
--brief --yes --allow-ssh
done
#Download files to ALICE3:
mkdir /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/genecollaps_ExWAS
dx download project-GGzFY70JBJzVx22v4Yj980J1:/analysis/collapsing_Hg38/ukb23158_c*
#Find gene p-value <= 5E-6:
mkdir /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_variant/genecollap_ExWAS
# Column 12 holds -log10(p). The first awk converts it to a plain p-value
# (the assignment used as a pattern also makes awk print each modified line);
# the second awk keeps rows with p <= 5E-6 after dropping the header line.
awk '$12 = 10^(-$12)' \
/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/genecollaps_ExWAS/ukb23158_c*_b0_v1_sa_collapsing_backman_gene_p_broad_pheno_1_5_ratio.regenie | \
grep -v "^#" | awk 'NR > 1 && $12 <= 0.000005 {print}' \
> input/rare_variant/genecollap_ExWAS/SArarevar_genecollap_suggestive
##Look-up of exonic rare variants (MAF < 0.01) within +/- 500Kb of credible set variants for each locus single variant and gene-based results.
##Genes of exonic rare variants in the regions with suggestive p-value <= 5E-6 were identified.
# NOTE(review): unlike the single-variant look-up, this awk tests columns
# $1/$2/$3 — confirm the gene-based output's column layout matches.
for line in {1..615}
do
chr=$(awk -v row="$line" ' NR == row {print $1 } ' input/cs_vars_liftover_output.bed | sed 's/chr//g')
pos=$(awk -v row="$line" 'NR == row {print $2 }' input/cs_vars_liftover_output.bed)
awk -v chr_idx=$chr -v pos_idx=$pos '$1 == chr_idx && $2 >= pos_idx-500000 && $3 <= pos_idx+500000 {print}' input/rare_variant/genecollap_ExWAS/SArarevar_genecollap_suggestive
done
##no Look-up results
I obtained results for single variant and gene-based exome-wide
association analysis including 7,413 cases and 36,955 controls, and
covariates
age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex.
ExWASes were performed using the tool REGENIE v3.1.1 and the Research
Analysis Platform (RAP, https://ukbiobank.dnanexus.com/landing).
To
identify rare variant suggestive association and mapped genes, I
implemented a look-up analysis for exonic rare variants (MAF < 0.01)
within +/- 500Kb from common credible set variants.
To combine the results from all the distinct analyses and see which
genes were highlighted:
src/genes_heatmap.R
#!/usr/bin/env Rscript
#Rationale: visualisation of genes prioritised from all analyses
library(scales)
library(tidyverse)
library(data.table)
library(readxl)
library(RColorBrewer)
#input each analysis table:
genes_raw <- "input/var2genes_raw.xlsx"
# One (gene, evidence) table per analysis. Single-column sheets get the
# evidence label added here; the merged eQTL/pQTL sheets are assumed to
# already carry gene + evidence columns (required for the rbind below).
nearest_gene <- read_excel(genes_raw, sheet = "nearest_genes",col_names = "gene")
nearest_gene$evidence <- "nearest"
annotation <- read_excel(genes_raw, sheet = "varannot_genes",col_names = "gene")
annotation$evidence <- as.factor("func_annot")
eqtl <- read_excel(genes_raw, sheet = "eQTL_genes_merge")
pqtl <- read_excel(genes_raw, sheet = "pQTL_genes_merge")
pops <- read_excel(genes_raw, sheet = "PoPS_genes",col_names = "gene")
pops$evidence <- as.factor("PoPS")
mouseko <- read_excel(genes_raw, sheet = "Mouseko_genes",col_names = "gene")
mouseko$evidence <- as.factor("mouse_KO")
raredis <- read_excel(genes_raw, sheet = "Raredisease_genes",col_names = "gene")
raredis$evidence <- as.factor("rare_disease")
#merge:
v2g_full <- rbind(nearest_gene,annotation,eqtl,pqtl,pops,mouseko,raredis)
#table with all the possible combination:
v2g_full_combination <- unique(expand.grid(x = v2g_full$gene, y = v2g_full$evidence, KEEP.OUT.ATTRS = TRUE)) %>% arrange(x)
colnames(v2g_full_combination) <- colnames(v2g_full)
#mask (conceptually: v2g_full_combination %in% v2g_full):
# paste0 across columns builds a per-row key; the long rename strips the
# auto-generated column name produced by as.data.frame().
mask <- as.data.frame(do.call(paste0, v2g_full_combination) %in% do.call(paste0, v2g_full)) %>% rename(status='do.call(paste0, v2g_full_combination) %in% do.call(paste0, v2g_full)')
# status: 1 if the (gene, evidence) pair was observed, 0 otherwise.
v2g_full_combination <- cbind(v2g_full_combination,mask) %>% mutate(status=as.integer(as.logical(status))) %>% arrange(status,gene)
#long to wide reshape using spread() in tidyr:
v2g_full_combination <- spread(v2g_full_combination, evidence, status) %>% remove_rownames %>% column_to_rownames(var="gene")
# Convert row names into first column
v2g_full_combination <- setDT(v2g_full_combination, keep.rownames = TRUE)[]
setnames(v2g_full_combination, "rn", "gene")
# reshape back to long (gene, evidence, 0/1) for plotting:
v2g_full_combination2 <- melt(v2g_full_combination, id.var = "gene")
length(unique(v2g_full_combination2$gene))
fwrite(v2g_full_combination2,"./output/v2g_gene_prioritisation.txt",sep="\t",quote=F)
# Plot
##use this to find which colour I want: for the pie, I extract the index for the colour I want:
##everytime it changes, so I noted down the actual code of the colour: ##18: col[18] #9EBCDA
#n <- 30
#colrs <- brewer.pal.info[brewer.pal.info$colorblind == TRUE, ]
#col_vec = unlist(mapply(brewer.pal, colrs$maxcolors, rownames(colrs)))
#col <- sample(col_vec, n)
#area <- rep(1,n)
#pie(area, col = col)
#fnct for the plot:
# Build the gene-by-evidence heatmap: white tiles for absent evidence,
# blue (#9EBCDA) for prioritised genes, evidence labels along the top,
# no legend. Returns the ggplot object.
fc_heatmap_all <- function(df,x_val,y_val,fill_val) {
  p <- ggplot(df, aes(x=x_val, y=y_val, fill=fill_val))
  p <- p + geom_tile()
  p <- p + scale_fill_gradient2(low = "white", mid = "white", high = "#9EBCDA")
  p <- p + scale_x_discrete(position = "top")
  p <- p + theme(axis.text = element_text(face = "bold"),
                 axis.ticks.y = element_blank(),
                 axis.ticks.x = element_blank())
  p <- p + xlab("Evidence") + ylab("Gene")
  p <- p + theme(legend.position = "none")
  p <- p + ggtitle("Gene prioritization")
  p
}
#Split plot into 9 plots of 11 genes each:
# NOTE(review): the facet assignment below assumes the row count matches the
# combined rep() lengths exactly; changing the number of genes or evidence
# columns will break the mutate() with a length mismatch.
test <- v2g_full_combination2 %>%
arrange(gene, variable) %>%
group_by() %>%
mutate(facet=c(rep(9, ceiling(n()/9)),rep(8, ceiling(n()/9)),rep(7, ceiling(n()/9)),rep(6, ceiling(n()/9)),rep(5, ceiling(n()/9)),rep(4, ceiling(n()/9)),rep(3, ceiling(n()/9)),rep(2, ceiling(n()/9)), rep(1, floor(n()/9)))) %>%
ungroup
# Render the faceted heatmap to PNG:
png("./output/V2G_heatmap_subplots.png", width=1000, height = 900)
fc_heatmap_all(test,test$variable,test$gene,test$value) + facet_wrap(~facet, scales="free", ncol=3) +
theme(strip.background = element_blank(), strip.text = element_blank(),
axis.text.x=element_text(angle=75, vjust=0, hjust=0, face="bold"),
axis.title=element_blank(),
axis.text.y=element_text(hjust=1, face="italic"),
axis.ticks = element_blank(),
legend.title = element_blank())
dev.off()
To understand locus-gene(s) association:
src/Locus_to_genes_table.R
#!/usr/bin/env Rscript
#Rationale: create table with results from each individual analysis:
#evidence signal gene locus start end chr pos trait effect other eaf Z P Novel width
library(tidyverse)
library(data.table)
library(writexl)
library(readxl)
#Cred_set variants:
#in credset, a1 is actually allele2 in gwas; a2 is actually allele1 in gwas
credset <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt") %>% select(-PIP_finemap,-PIP_susie) %>% rename(posb37=position,a1=allele2,a2=allele1,chr=chromosome)
#add posb38:
b38 <- fread("input/cs_vars_liftover_output.bed")
colnames(b38) <- c("chr_b38","posb38","pos1_b38")
credset_b38 <- cbind(credset,b38) %>% select(-chr_b38, -pos1_b38) %>% relocate(posb38, .after = posb37)
#add gwas info to credset with also posb38:
gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>% rename(chr=b37chr,posb37=bp)
credset_gwas <- credset_b38 %>% left_join(gwas,by=c("chr","posb37","a1","a2","snpid"))
credset_gwas <- credset_gwas %>% arrange(chr,posb37)
credset_gwas$sentinel <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a2,"_",credset_gwas$a1)
##Nearest gene:
ng <- fread("output/PIP_sentinels_nearestgenes") %>% select(locus,sentinel,Nearest_gene)
ng$evidence <- as.factor("nearest_gene")
credset_gwas_ng <- credset_gwas %>% left_join(ng, by=c("sentinel","locus"))
##extract only variants with evidence for nearest gene - the sentinel ones cus I did nearest gene by locus:
credset_gwas_ng2 <- credset_gwas_ng %>% filter(!is.na(evidence))
## ---- Functional annotation (FAVOR) ----
# Three criteria, each keyed by GRCh38 chromosome/position:
# FANTOM5 enhancers:
fantom5 <- fread("output/fnc_annot_fantom5") %>% rename(chr=Chromosome,posb38=Position)
fantom5$evidence <- as.factor("fantom5")
credset_gwas_fantom5 <- credset_gwas %>% left_join(fantom5, by=c("chr","posb38"))
# Integrative functional scores (aPCs):
inscores.aPCs <- fread("output/fnc_annot_inscores") %>% rename(chr=Chromosome,posb38=Position)
inscores.aPCs$evidence <- as.factor("inscores.aPCs")
credset_gwas_inscores.aPCs <- credset_gwas %>% left_join(inscores.aPCs, by=c("chr","posb38"))
# ClinVar:
clinvar <- fread("output/fnc_annot_clinvar") %>% rename(chr=Chromosome,posb38=Position)
clinvar$evidence <- as.factor("clinvar")
credset_gwas_clinvar <- credset_gwas %>% left_join(clinvar, by=c("chr","posb38"))
## Merge the three functional criteria.
# The chained full joins keep every variant flagged by at least one criterion;
# rows with no functional evidence are dropped, then the per-criterion gene
# columns are collapsed (underscore-separated, NAs removed) into one 'Gene'
# column. NOTE(review): 'col_for_join' is redefined later in this script with
# a different column set — the two definitions are independent.
col_for_join <- c("locus","snpid","chr","posb37","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel","evidence")
fantom5_inscores <- full_join(credset_gwas_fantom5,credset_gwas_inscores.aPCs,by=col_for_join)
fantom5_inscores_clinvar <- full_join(fantom5_inscores,credset_gwas_clinvar,by=col_for_join) %>%
filter(!is.na(evidence)) %>%
unite(col='Gene', c('Nearest_gene', 'Genecode.Comprehensive.Info', 'Gene.Reported'),na.rm=TRUE)
## ---- eQTL colocalisation ----
# GTExV8 (coloc + coloc-SuSiE results combined):
gtex_coloc <- fread("output/coloc_asthma_GTEx.tsv") %>% select(snp, gene, tissue)
gtex_colocsusie <- fread("output/colocsusie_asthma_GTEx.tsv") %>% select(snp, gene, tissue)
gtex <- rbind(gtex_coloc, gtex_colocsusie) %>% rename(sentinel_gtex=snp)
gtex$evidence <- as.factor("eqtl_gtex")
# The coloc outputs identify sentinels as chr_pos_a1_a2 — the reverse allele
# order of the 'sentinel' column built earlier — so a dedicated key is built:
credset_gwas$sentinel_gtex <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_gtex <- credset_gwas %>% left_join(gtex, by="sentinel_gtex")
credset_gwas_gtex2 <- credset_gwas_gtex %>% filter(!is.na(evidence)) %>% rename(gene_ensembl=gene)
# Map Ensembl IDs to gene symbols (two unnamed columns in the sheet):
gtex_genesymbol <- read_excel("input/var2genes_raw.xlsx", sheet="GTExV8_eQTL_genes_symbol",col_names=F) %>% rename(gene_ensembl="...1",gene="...2")
credset_gwas_gtex2 <- credset_gwas_gtex2 %>% left_join(gtex_genesymbol,by="gene_ensembl") %>% select(-gene_ensembl)
# eQTLGen (blood); same a1_a2 sentinel-ID convention as GTEx:
eqtlgen<- fread("output/coloc_asthma_eqtlgen.tsv") %>% select(snp, gene, tissue) %>% rename(sentinel_eqtlgen=snp)
eqtlgen$evidence <- as.factor("eqtl_eqtlgen")
credset_gwas$sentinel_eqtlgen <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_eqtlgen <- credset_gwas %>% left_join(eqtlgen, by="sentinel_eqtlgen")
credset_gwas_eqtlgen2 <- credset_gwas_eqtlgen %>% filter(!is.na(evidence))
# UBCLung; note the gene column is named 'gene_id' here (renamed downstream):
ubclung <- fread("output/coloc_asthma_ubclung.tsv") %>% select(snp, gene_id, tissue) %>% rename(sentinel_ubclung=snp)
ubclung$evidence <- as.factor("eqtl_ubclung")
credset_gwas$sentinel_ubclung <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_ubclung <- credset_gwas %>% left_join(ubclung, by="sentinel_ubclung")
credset_gwas_ubclung2 <- credset_gwas_ubclung %>% filter(!is.na(evidence))
## ---- pQTL look-up ----
# UK Biobank pQTL: LOCUS is "locus/sentinel", split into two columns;
# positions are GRCh38, so the join uses posb38:
ukbpqtl <- fread("output/lookup_ukbpqtl.txt")
ukbpqtl <- ukbpqtl %>% separate(LOCUS, c("locus", "sentinel_ukbpqtl"), sep = "/")
ukbpqtl <- ukbpqtl %>% rename(chr=CHROM,posb38=GENPOS)
ukbpqtl$evidence <- as.factor("pqtl_ukb")
credset_gwas_ukbpqtl <- credset_gwas %>% left_join(ukbpqtl, by=c("locus","chr","posb38"))
credset_gwas_ukbpqtl2 <- credset_gwas_ukbpqtl %>% filter(!is.na(evidence))
# SCALLOP: the file is headerless and tab-separated, but the last field holds
# "TotalSampleSize Gene" space-separated, hence the separate() on V12 before
# the column names are assigned.
# Columns: Chrom Pos MarkerName Allele1 Allele2 Freq1 FreqSE Effect StdErr P-value Direction TotalSampleSize Gene
scallop <- fread("output/scallop_ukbpqtl.txt",head=F,sep="\t")
scallop <- scallop %>% separate(V12, c("TotalSampleSize","Gene"),sep=" ")
colnames(scallop) <- c("Chrom","Pos","MarkerName","Allele1","Allele2","Freq1","FreqSE","Effect", "StdErr","P-value","Direction","TotalSampleSize","Gene")
scallop <- scallop %>% rename(chr=Chrom,posb37=Pos)
scallop$evidence <- as.factor("pqtl_scallop")
# SCALLOP positions are build 37, joined on chr/posb37 (no allele matching):
credset_gwas_scallop <- credset_gwas %>% left_join(scallop, by=c("chr","posb37"))
credset_gwas_scallop2 <- credset_gwas_scallop %>% filter(!is.na(evidence))
## ---- PoPS (Polygenic Priority Score) ----
# PoPS results are per signal/locus; joined on locus + sentinel:
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_results_merged_table.txt")
pops <- pops %>% rename(locus=signal_id) %>% relocate(gene_symbol, .before = ENSGID)
pops$evidence <- as.factor("pops")
credset_gwas_pops <- credset_gwas %>% left_join(pops, by=c("locus","sentinel"))
credset_gwas_pops2 <- credset_gwas_pops %>% filter(!is.na(evidence))
## ---- Mouse knock-out ----
# Keep only genes whose 500kb window overlaps the locus (overlap == TRUE):
mko <- fread("input/mko_results_500kb.csv") %>% filter(overlap == TRUE) %>% select(-Position, -overlap, -MKO) %>% rename(posb37=pos) %>% arrange(chr,posb37)
mko$evidence <- as.factor("mouse_ko")
credset_gwas_mko <- credset_gwas %>% left_join(mko, by=c("locus","sentinel","chr","posb37"))
credset_gwas_mko2 <- credset_gwas_mko %>% filter(!is.na(evidence))
## ---- Rare disease ----
# Headerless input; column names assigned manually, joined on sentinel only:
raredis <- fread("input/raredis_results_500kb_by_gene.csv")
colnames(raredis) <- c("Symbol","sentinel","frequency_disease","distance","name_disease","HPOterm")
raredis$evidence <- as.factor("rare_disease")
credset_gwas_raredis <- credset_gwas %>% left_join(raredis, by= "sentinel")
credset_gwas_raredis2 <- credset_gwas_raredis %>% filter(!is.na(evidence))
# Merge the results altogether to obtain a single file with
# locus - snp - gene - evidence. Only 'evidence' and 'gene' differ between
# sources, plus 'tissue' for the eQTL data:
col_for_join <- c("locus","snpid","chr","posb37","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel","evidence","gene","tissue")
# Harmonise the gene column name across the eQTL sources — GTEx and eQTLGen
# already use 'gene'; only UBCLung needs renaming from 'gene_id':
credset_gwas_ubclung2 <- credset_gwas_ubclung2 %>% rename(gene=gene_id)
eqtl_all <- credset_gwas_gtex2 %>%
full_join(credset_gwas_eqtlgen2,by=col_for_join) %>%
full_join(credset_gwas_ubclung2, by=col_for_join) %>%
select(all_of(col_for_join))
# Join key without 'tissue' for the non-eQTL evidence sources:
col_for_join <- c("locus","snpid","chr","posb37","gene","evidence","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel")
# Harmonise the gene column name for the remaining evidence sources:
credset_gwas_ng2 <- credset_gwas_ng2 %>% rename(gene=Nearest_gene)
fantom5_inscores_clinvar <- fantom5_inscores_clinvar %>% rename(gene=Gene)
credset_gwas_ukbpqtl2 <- credset_gwas_ukbpqtl2 %>% rename(gene=PROTEIN)
credset_gwas_scallop2 <- credset_gwas_scallop2 %>% rename(gene=Gene)
credset_gwas_raredis2 <- credset_gwas_raredis2 %>% rename(gene=Symbol)
credset_gwas_mko2 <- credset_gwas_mko2 %>% rename(gene=Symbol)
credset_gwas_pops2 <- credset_gwas_pops2 %>% rename(gene=gene_symbol)
# Stack all evidence into one table; unique() removes duplicate rows that
# arise when the same variant-gene-evidence triple appears more than once:
v2g_all <- eqtl_all %>%
full_join(credset_gwas_ng2,by=col_for_join) %>%
full_join(fantom5_inscores_clinvar,by=col_for_join) %>%
full_join(credset_gwas_ukbpqtl2,by=col_for_join) %>%
full_join(credset_gwas_scallop2,by=col_for_join) %>%
full_join(credset_gwas_raredis2,by=col_for_join) %>%
full_join(credset_gwas_mko2,by=col_for_join) %>%
select(all_of(col_for_join)) %>% arrange(chr,posb37,locus,gene,evidence) %>% unique()
# Minimal locus-to-gene table: one row per unique locus-gene pair.
v2g_minimal <- v2g_all %>% select(locus,gene) %>% unique()
# Save the results of each analysis as a sheet of a single xlsx file.
# NOTE(review): the list is unnamed, so sheets get default names; consider
# naming the elements if downstream code reads sheets by name.
df_list <- list(v2g_minimal,v2g_all,credset_gwas_ng2,fantom5_inscores_clinvar,credset_gwas_gtex2,credset_gwas_eqtlgen2,
credset_gwas_ubclung2,credset_gwas_ukbpqtl2,credset_gwas_scallop2,credset_gwas_raredis2,credset_gwas_mko2,credset_gwas_pops2)
write_xlsx(df_list,path = "src/report/var2gene_full.xlsx", col_names = TRUE, format_headers = TRUE)
## Rare variant ExWAS (single rare-variant and gene-collapsing rare variant): no genes/results from these analyses
src/Region_plot_V2G.sh
#!/bin/bash
#SBATCH --job-name=regionplot
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=30gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#Rationale: visualise V2G results in the different credible sets locus- R2 with regards to SNP with highest average PIP.
module load R
#workdir: Var_to_Gene/
PATH_data="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data"
#mkdir /scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot
PATH_TMP="/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot"
##digest FAVOR files:
#awk -F "," '{print $1, $2, $3, $8, $9, $10, $11, $12}' input/FAVOR_credset_chrpos38_2023_08_08.csv \
# > ${PATH_TMP}/FAVOR_credset_annotations_digest_08_08_23.csv
##Calculate R2 with respect to variant with highest PIP in each locus:
##Use the 10,000 European ancestry individuals in /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB:
##find all credible set variants in EUR_UKB by chr and pos (alleles can be flipped):
#awk '{print $3" "$3"_"$4"_"}' /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt | \
# sed 's/ //g' | \
# grep -F -f - /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr*_EUR_selected_nodups.bim | \
# awk '{print $2}' > ${PATH_TMP}/cs_variants_EUR_UKB
##find the SNPs in EUR_UKB by chr and pos (alleles can be flipped):
##NOTE(review): the awk command below is missing a space between '{print $2}'
##and the input path — add one before re-enabling this step.
#awk '{print $2}'input/highest_PIP_sentinels | \
# awk -F "_" '{print $1, $1"_"$2"_"}' | sed 's/ / /g' | \
# grep -F -f - /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr*_EUR_selected_nodups.bim | \
# awk '{print $2}' > ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB
# Loop over the sentinel variants, one per line of highest_PIP_sentinels_EUR_UKB.
# NOTE(review): the upper bound {1..17} is hard-coded — keep it in sync with
# the number of lines in that file (e.g. via $(wc -l < file)).
for line in {1..17}
do
# SNP id for the current row, its chromosome (first '_' field), and the
# chr_pos prefix used to look up this sentinel in input/highest_PIP_sentinels:
SNP=$(awk -v row="$line" ' NR == row {print $0 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
chr=$(awk -F "_" -v row="$line" ' NR == row {print $1 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
SNP_tmp=$(awk -F "_" -v row="$line" ' NR == row {print $1"_"$2 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
# Locus boundaries: last two '_' fields of the matching sentinel entry:
start=$(grep ${SNP_tmp}"_" input/highest_PIP_sentinels | awk '{print $1}' | awk -F "_" '{print $(NF-1)}')
end=$(grep ${SNP_tmp}"_" input/highest_PIP_sentinels | awk '{print $1}' | awk -F "_" '{print $(NF)}')
#create R2 according to leading p-value:
#Calculate R2 with respect to the leading SNP:
#module unload plink2/2.00a
#module load plink
#plink --bfile /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr${chr}_EUR_selected_nodups \
# --chr ${chr} --from-bp ${start} --to-bp ${end} \
# --allow-no-sex --r2 inter-chr --ld-snp ${SNP} --ld-window-r2 0 \
# --out ${PATH_TMP}/${SNP}_${start}_${end}_ld_file
# Render the region plot for this locus (LD file must exist from the plink
# step above; args: ld file, output pdf, title, start, end, chr, lead SNP):
Rscript src/Region_plot_V2G_2.R \
${PATH_TMP}/${SNP}_${start}_${end}_ld_file.ld \
output/region_plots_V2G/rp_v2g_${chr}_${SNP}_${start}_${end}.pdf \
${chr}_${SNP}_${start}_${end} \
${start} ${end} ${chr} ${SNP}
done
src/Region_plot_V2G.R
#!/usr/bin/env Rscript
#Rationale: plot of Genomic Locus and variant-to-gene mapping genes:
##variants in credible set, with lead variant as the one with highest PIP
##functional annotation for only credible set variants, using as by FAVOR
##R2 between variants in the region
##Gene highlighted by the variant-to-gene mapping analysis with the type of evidence
library(tidyverse)
library(data.table)
library(dplyr)
# FAVOR annotation export; fields may contain embedded double quotes which
# are stripped below with gsub:
annot <- read.table("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/FAVOR_credset_chrpos38_2023_08_08.csv", sep=",", stringsAsFactors = FALSE, fill=TRUE, header=TRUE)
annot <- annot %>% select("Variant..VCF.","Chromosome","Position","Genecode.Comprehensive.Category") %>%
rename(id="Variant..VCF.",chromosome="Chromosome",posb38="Position")
# Strip stray quote characters; sapply coerces every column to character,
# hence the as.numeric conversions further down.
annot <- as.data.frame(sapply(annot, function(x) gsub("\"", "", x)))
annot <- annot %>% rename(Functional_annotation=Genecode.Comprehensive.Category)
#liftover online:
#awk -F "_" '{print "chr"$1,$2,$2}' /scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/cs_variants_EUR_UKB > /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/cs_variants_EUR_UKB_lifover_input
credset <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/cs_variants_EUR_UKB",header=FALSE)
# Split "chr_pos_a1_a2" ids. Note: because 'into' is passed by name, the
# positional "_" argument binds to 'sep' (tidyr::separate signature).
credset <- credset %>% separate(V1, "_", into=c("chromosome", "posb37", "allele1", "allele2"))
b38 <- fread("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/cs_variants_EUR_UKB_liftover_output.bed",header=FALSE)
# V4 holds "chr:pos-pos"-style text; keep only the build-37 position part:
b38 <- b38 %>% separate(V4, "-", into=c("chr_pos", "posb37"))
b38$chr_pos <- NULL
b38$posb37 <- as.numeric(b38$posb37)
# Rebuild the "chrN" key so credset rows can be matched to the liftOver bed:
credset$V1 <- paste0("chr",credset$chromosome)
credset$posb37 <- as.numeric(credset$posb37)
credset_b38 <- left_join(credset,b38,by=c("V1","posb37")) %>% select(-V1, -V3, -V5)
credset_b38 <- credset_b38 %>% rename(posb38=V2)
# Harmonise key types before joining annotation on chromosome + posb38:
credset_b38$chromosome <- as.numeric(credset_b38$chromosome)
annot$chromosome <- as.numeric(annot$chromosome)
credset_b38$posb38 <- as.numeric(credset_b38$posb38)
annot$posb38 <- as.numeric(annot$posb38)
credset_annot <- inner_join(credset_b38, annot, by=c("chromosome","posb38"))
# Flag credible-set membership, then annotate the full GWAS: non-credset
# variants get Functional_annotation="unknown" and credset=0.
credset_annot$credset <- "1"
gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>% rename(chromosome=b37chr,posb37=bp)
gwas_credset_annot <- left_join(gwas,credset_annot,by=c("chromosome","posb37"))
gwas_credset_annot <- gwas_credset_annot %>% select(chromosome,posb37,pval,a1,a2,Functional_annotation,credset,snpid)
gwas_credset_annot <- gwas_credset_annot %>% mutate(Functional_annotation=ifelse(is.na(gwas_credset_annot$Functional_annotation), "unknown",gwas_credset_annot$Functional_annotation))
gwas_credset_annot$Functional_annotation <- as.factor(gwas_credset_annot$Functional_annotation)
gwas_credset_annot <- gwas_credset_annot %>% mutate(credset=ifelse(is.na(gwas_credset_annot$credset), 0, gwas_credset_annot$credset))
fwrite(gwas_credset_annot,"/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/gwas_credset_annot",quote=F,sep="\t",row.names=F,col.names=T)
src/Region_plot_V2G_2.R
#!/usr/bin/env Rscript
#Rationale: plot of fine-mapping PIP with functional annotation for variants in credible set:
library(tidyverse)
library(data.table)
library(dplyr)
library(RColorBrewer)
library(cowplot)
library(readxl)
# Command-line arguments: LD (.ld) file, output pdf path, plot title,
# locus boundaries (bp, build 37), chromosome and lead SNP id.
args <- commandArgs(T)
r2_file <- args[1]
locus <- as.character(args[2])          # output pdf path passed to ggsave()
locus_ggtitle <- as.character(args[3])  # plot title
start <- as.numeric(args[4])
end <- as.numeric(args[5])
chr <- args[6]
snp_lead <- as.character(args[7])
print(start)
print(end)
print(chr)
print(snp_lead)
df <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/gwas_credset_annot") %>% select(-snpid)
# Rebuild snpid with alphabetically ordered alleles (pmin/pmax) so it matches
# the LD-file ids regardless of allele orientation.
df <- df %>% mutate(snpid = paste0(chromosome,"_",posb37,"_",pmin(a1,a2),"_",pmax(a1,a2)))
# Attach R2 (vs the lead SNP) from the plink output:
r2 <- fread(r2_file) %>% select(CHR_B,BP_B,SNP_B,R2)
colnames(r2) <- c("chromosome","posb37","snpid","R2")
df_r2 <- left_join(df,r2,by=c("chromosome","posb37","snpid"))
df_r2 <- df_r2 %>% mutate(Functional_annotation=ifelse(is.na(Functional_annotation), "unknown",Functional_annotation))
df_r2$Functional_annotation <- as.factor(df_r2$Functional_annotation)
# Fix: compute -log10(p) from df_r2 itself (was df$pval), so the column stays
# aligned with its own rows even if the join ever changes row order.
df_r2$logP <- -log10(df_r2$pval)
# R2 colour gradient for the plot:
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, 1))
## Association plot with functional annotation (shape) and R2 (colour);
## credible-set variants are circled in red and high-LD variants labelled.
annot_r2 <- df_r2 %>% ggplot(aes(posb37, logP, shape = Functional_annotation, colour = R2, label = snpid)) +
geom_point(size=2) +
scale_shape_manual(values=c("downstream"=15, "exonic"=16, "intergenic"=17, "intronic"=18, "ncRNA_exonic"=19, "ncRNA_intronic"=10, "unknown"=11,"upstream;downstream"=12, "UTR3"=13, "UTR5"=14)) +
geom_point(data=df_r2[df_r2$credset==1,], pch=21, fill=NA, size=4, colour="red", stroke=1, alpha=0.5) +
geom_text(aes(label=ifelse(R2>0.95,as.character(snpid),'')),hjust=0,vjust=0,size=2.5,colour="black") +
theme_minimal() +
theme(legend.position = "right") + sc +
ggtitle(paste0(locus_ggtitle," (snp lead:", snp_lead,")")) + theme(plot.title = element_text(hjust=0.5)) + xlim(start,end) + ylab("-log10(pvalue)")
## Gene-track plot (treated like a gantt chart).
# Prioritised genes for this locus; the locus is matched by grepl on the
# 'end' coordinate appearing in the sheet's locus name.
l2g <- read_excel("src/report/locus2gene.xlsx",sheet = "L2G_clean") %>% filter(grepl(end,locus)) %>% select(gene)
# Readable gene list for the title (was paste0 on the whole tibble, which
# printed a c(...) deparse instead of the gene names):
l2g_genes <- paste(l2g$gene, collapse = ", ")
# Gene list from the LocusZooms R package:
genes <- fread("/home/n/nnp5/software/LocusZooms/Gencode_GRCh37_Genes_UniqueList2021.txt")
# Keep only genes fully contained in the locus:
genes <- genes %>% filter(Chrom==paste0("chr",chr), Start >= start, End <= end)
# Set factor level to order the genes on the plot:
genes$Gene <- as.factor(genes$Gene)
# ggplot() + geom_linerange replaces the deprecated qplot() call:
plot_gantt_gene <- ggplot(genes, aes(x = Gene, ymin = Start, ymax = End, colour = Coding)) +
geom_linerange(size = 5) +
scale_colour_manual(values = c("lncRNA"="burlywood3", "ncRNA"="aquamarine4", "proteincoding"="chocolate", "pseudogene"="chocolate4")) +
coord_flip() +
theme_bw() +
theme(panel.grid = element_blank()) +
ylab("posb37") +
xlab("genes") +
ylim(start,end) + labs(title = paste0("V2G identified genes: ",l2g_genes))
# Stack the two panels and save explicitly (was relying on auto-print +
# last_plot(), which is fragile in non-interactive Rscript sessions):
final_plot <- plot_grid(annot_r2 + theme(legend.justification = c(0,1)), plot_gantt_gene + theme(legend.justification = c(0,1)), ncol=1, align='v', rel_heights = c(2,1))
ggsave(locus, plot = final_plot, width = 50, height = 60, units = "cm")
I identified a total of 98 genes. For each analysis:
Nearest gene identified 18 genes;
Functional annotation identified 40 genes (FANTOM 14 genes, Integrative scores 37 genes, ClinVar 6 genes);
eQTL: GTExV8 8 genes, eQTLgen blood 5 genes, UBC Lung eQTL 3 genes;
pQTL: UK Biobank 4 genes, SCALLOP 2 genes (no genes for deCODE);
PoPS 16 genes;
Mouse knock out 32 genes;
Rare disease 6 genes;
No genes were found from the look-up analyses in the exome single or
gene-based rare variant analyses. Two variants
(chr6_45913842_C_G,chr7_1489310_G_A) were significant, but not in the
sentinel regions; they map to genes TTK and INTS1 respectively.
Full variant-to-gene results available here:
Out of the 98 genes, 24 genes were supported by at least 2 criteria (Figure 2):
16 genes by two criteria: AC034102.4,AC034114.2,AC044784.1,AC116366.3,CAMK4,HDAC7,HLA-DQA1,HLA-DRB1,HLA-DRB5,IKZF3,IL33,NR1D1,ORMDL3,PGAP3,PPP1R1B,SUOX
6 genes by three criteria: D2HGDH,IL4R,NSMCE1,RORA,RPS26,SMAD3
2 genes by four criteria: BACH2, IL1RL1
## Locus-to-gene

Below are the region plots showing linkage-disequilibrium measures with
regard to the variant with the highest posterior inclusion probability
from the fine-mapping analysis, together with the genes mapped in each
genomic locus.